#cython: language_level=3
cimport cython
import numpy as np
import os
import csv
import urllib.request
def load_and_parse_data(percent=100):
"""This function downloads, parses and returns the data
percent: float
The percentage of data to load. Load less data if
processing is too slow
Returns
=======
(ids, : list of IDs of the different patterns - one pattern per row
varieties, : list of the varieties that can be distinguished - one per column
data) : 2D numpy array of integers, -1, 0, 1, 2, which show whether
or not this pattern can distinguish the variety
"""
filename = "AppleGenotypes.csv"
if not os.path.exists(filename):
# The file does not exist - download it!
url = "https://raw.githubusercontent.com/chryswoods/minimalmarkers/main/example"
filename = "AppleGenotypes.csv"
urllib.request.urlretrieve(f"{url}/{filename}", filename)
# Now parse the data. This reads the data using the
# csv module, doing some formatting that is needed
# for this type of data
lines = open(filename, "r").readlines()
# try to discover the separator used for the file (should be a comma)
dialect = csv.Sniffer().sniff(lines[0], delimiters=[" ", ",", "\t"])
# the varieties are the column headers (minus the first column
# which is the ID code for the pattern)
varieties = []
for variety in list(csv.reader([lines[0]], dialect=dialect))[0][1:]:
varieties.append(variety.lstrip().rstrip())
ids = []
nrows = len(lines) - 1
ncols = len(varieties)
if percent != 100:
nrows = min(nrows, int(nrows * percent / 100.0))
data = np.full((nrows, ncols), -1, np.int8)
npatterns = 0
irow = np.full(ncols, -1, np.int8)
for i in range(1, nrows+1):
parts = list(csv.reader([lines[i]], dialect=dialect))[0]
if len(parts) != ncols+1:
print("WARNING - invalid row! "
f"'{parts}' : {len(parts)} vs {ncols}")
else:
ids.append(parts[0])
row = np.asarray(parts[1:], np.string_)
pattern = data[npatterns]
pattern[(row == b'0') | (row == b'A')] = 0
pattern[(row == b'1') | (row == b'AB')] = 1
pattern[(row == b'2') | (row == b'B')] = 2
npatterns += 1
# Verify that the data has the right dimensions
nrows = len(ids)
ncols = len(varieties)
assert nrows == data.shape[0]
assert ncols == data.shape[1]
return (ids, varieties, data)
##############
############## We will be speeding up the code below
##############
@cython.boundscheck(False)
def calculate_scores(data):
"""Calculate the score for each row. This is calculated
as the sum of pairs of values on each row where the values
are not equal to each other, and neither are equal to -1.
Returns
=======
scores : numpy array containing the scores
"""
cdef int nrows = data.shape[0]
cdef int ncols = data.shape[1]
# Here is the list of scores
scores = np.zeros(nrows)
cdef double[::1] scores_view = scores
cdef signed char[:,:] data_view = data
cdef int irow = 0
cdef signed char ival = 0
cdef signed char jval = 0
cdef int i = 0
cdef int j = 0
with nogil:
# Loop over all rows
for irow in range(0, nrows):
for i in range(0, ncols):
for j in range(i, ncols):
ival = data_view[irow, i]
jval = data_view[irow, j]
if ival != -1 and jval != -1 and ival != jval:
scores_view[irow] += 1
return scores
def get_index_of_best_score(scores):
"""Return the index of the best score from the passed list of scores"""
# Now find the pattern with the highest score
best_score = 0
best_pattern = None
for irow in range(0, len(scores)):
if scores[irow] > best_score:
best_pattern = irow
best_score = scores[irow]
return best_pattern
if __name__ == "__main__":
# Load the data to be processed
(ids, varieties, data) = load_and_parse_data(5)
# Calculate all of the scores
scores = calculate_scores(data)
# Find the best pattern
best_pattern = get_index_of_best_score(scores)
# Print the result
print(f"The best score {scores[best_pattern]} comes from pattern {ids[best_pattern]}")
python setup.py build_ext --inplace
import cyslow
(ids, varieties, data) = cyslow.load_and_parse_data(5)
timeit(cyslow.calculate_scores(data))
3.59 ms ± 23.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
(ids, varieties, data) = cyslow.load_and_parse_data(100)
timeit(cyslow.calculate_scores(data))
75.6 ms ± 13.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The cython-compiled calculate_scores takes ~3.6 ms to process 5% of the data, and ~76 ms to process 100% of the data.
This compares to ~3.1 seconds for the original Python code, and (co-incidentally) 3.59 ms / 77 ms for the serial numba-optimised function. In this case, the cython-compiled code runs at the same speed as the numba-compiled code.
time python cyslow_main.py
The best score 21931.0 comes from pattern MDC054122.001_13602
python cyslow_main.py 0.43s user 0.10s system 207% cpu 0.260 total
cython-compiled version is twice as fast, in total, compared to the numba version (0.260 s versus 0.585 s). The increase in speed is likely to be because the cython-compiled cyslow module does not need to be interpreted, as it is already compiled machine code. This reduces the start-up time of the script.