diff --git a/Cargo.toml b/Cargo.toml index 14e36e4..896cf93 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "alignmentrs" -version = "0.10.0" +version = "0.10.3" authors = ["Kent Kawashima "] edition = "2018" diff --git a/alignmentrs/__init__.py b/alignmentrs/__init__.py index 6f091e0..20d2028 100644 --- a/alignmentrs/__init__.py +++ b/alignmentrs/__init__.py @@ -5,7 +5,7 @@ __author__ = 'Kent Kawashima' -__version__ = '0.10.0' +__version__ = '0.10.5' __all__ = [ # From dynamic library 'librs', diff --git a/alignmentrs/aln/alignment.py b/alignmentrs/aln/alignment.py index f48477d..ddaee5e 100644 --- a/alignmentrs/aln/alignment.py +++ b/alignmentrs/aln/alignment.py @@ -1,7 +1,9 @@ +""" Alignment class. """ + from collections import Counter -from copy import copy, deepcopy -import os -import inspect +# from copy import copy, deepcopy +# import os +# import inspect import warnings import pandas @@ -183,14 +185,10 @@ def _make_row_meta(self, data=None, ids=None, descriptions=None): # If descriptions is NOT specified but ids is specified, # use ids as index and return an empty DataFrame. elif (descriptions is None) and (ids is not None): - df = pandas.DataFrame([], index=ids) - df['description'] = [''] * len(ids) - return df + return pandas.DataFrame(None, index=ids) # If both descriptions and ids are not specified, # use default integer indexing and return an empty DataFrame. - df = pandas.DataFrame([], index=range(self.nrows)) - df['description'] = [''] * self.nrows - return df + return pandas.DataFrame(None, index=range(self.nrows)) def _make_col_meta(self, data=None, ids=None, descriptions=None): # Constructs column metadata using data, or @@ -296,7 +294,7 @@ def ncols(self): @property def ids(self): """list of str: Returns the list of identifiers.""" - return self.row_metadata.index.to_list() + return self.row_metadata.index.tolist() @property def sequences(self): diff --git a/alignmentrs/aln/col.py b/alignmentrs/aln/col.py index f1bc5e4..2e6c75f 100644 --- a/alignmentrs/aln/col.py +++ b/alignmentrs/aln/col.py @@ -1,6 +1,8 @@ -from copy import deepcopy +""" Classes for retrieving and removing data column by column. """ + +# from copy import deepcopy import numbers -import inspect +# import inspect import itertools import pandas diff --git a/alignmentrs/aln/mixins/serde.py b/alignmentrs/aln/mixins/serde.py index f1d6f1c..f74bf87 100644 --- a/alignmentrs/aln/mixins/serde.py +++ b/alignmentrs/aln/mixins/serde.py @@ -29,7 +29,14 @@ class FastaSerdeMixin: from a FASTA formatted file. """ @classmethod - def from_fasta(cls, path, name=None, parse_row_metadata=True, parse_description=True, column_metadata_decoders=None, column_metadata_regexp='c\|([A-Za-z0-9\s\.]+)=(\[[A-Za-z0-9\.\s,\"\']+\])', column_index_regexp='ci\|([A-Za-z0-9\s\.]+)=(\[[A-Za-z0-9\.\s,\"\']+\])', store_history=True, **kwargs): + def from_fasta( + cls, path, name=None, + # parse_row_metadata=True, + parse_description=True, + # column_metadata_decoders=None, + column_metadata_regexp='c\|([A-Za-z0-9\s\.]+)=(\[[A-Za-z0-9\.\s,\"\']+\])', + column_index_regexp='ci\|([A-Za-z0-9\s\.]+)=(\[[A-Za-z0-9\.\s,\"\']+\])', + store_history=True, **kwargs): """Create an Alignment object from a FASTA-formatted file. Parameters @@ -65,8 +72,9 @@ def from_fasta(cls, path, name=None, parse_row_metadata=True, parse_description match = re.search(column_index_regexp, metadata['descriptions'][0]) if match: key, value = match.groups() + # Convert text into a list using eval try: - value = eval(value) + value = cls._parse_str_to_list(value, 'infer') except SyntaxError: raise ValueError('Cannot construct Alignment from the given FASTA file: column index is malformed'.format(key)) # Put key-value pair into the dictionary @@ -78,10 +86,8 @@ def from_fasta(cls, path, name=None, parse_row_metadata=True, parse_description match_locations.append(match.span()) key, value = match.groups() # Convert text into a list using eval - # This is DANGEROUS and could open to exploitation. - # TODO: Add a prelimenary regex check to lessen vulnerability try: - value = eval(value) + value = cls._parse_str_to_list(value, 'infer') except SyntaxError: raise ValueError('Cannot construct Alignment from the given FASTA file: column metadata {} is malformed'.format(key)) # Put key-value pair into the dictionary @@ -159,6 +165,66 @@ def to_fasta(self, path=None, include_column_metadata=None, column_metadata_enco with open(path, 'w') as writer: print(fasta_str, file=writer) + @staticmethod + def _parse_str_to_list(string: str, item_type: type = 'infer'): + """ Returns a list by parsing a given string. The input string has to + expressed as like Python list syntax. + + Parameters + ---------- + string: str + A string to be converted into a list. Format should be Python + syntax of list object like "[1, 2, 3]". It has to starts with "[" + and ends with "]" and items have to be separated by ",". + item_type: type (default: str) + Type in which items in str-like list will be converted. For example, + "[1, 2, 3]" and int are passed to string and item_type variables + respectively, "[1, 2, 3]" will converted into [1, 2, 3] not + ["1", "2", "3"]. + + Return + ------ + A list version of the input string. + + """ + # Check if item_type variable is "type" type + if item_type != 'infer' and not isinstance(item_type, type): + raise TypeError('Invalid type: object constructor type should be '\ + 'passed to "item_type" variable.') + + # Check if sring is str + if not isinstance(string, str): + raise TypeError('Invalid type: "string" variable has to be str type.') + + # Check string format + if not string.startswith('['): + raise SyntaxError(f'Invalid syntax for conversion to a list. '\ + '{string} does not start with "[".') + if not string.endswith(']'): + raise SyntaxError(f'Invalid syntax for conversion to a list. '\ + '{string} does not end with "]".') + + # Convert into a list + if item_type == 'infer': + out_l = [] + for item in string.split('[')[1].split(']')[0].split(','): + try: + dat = int(item) + # e.g. int('1.1') gives "ValueError: invalid literal for int() + # with base 10: '1.1'" + except ValueError: + dat = float(item) + # e.g. float('a') gives "ValueError: could not convert string + # to float: 'a'" + except: + dat = item + + out_l.append(dat) + return out_l + + return [item_type(item) for item + in string.split('[')[1].split(']')[0].split(',')] + @staticmethod def _fasta_entry_formatter(sid, desc, seq, col_meta): # Formats the ID, description, stringed metadata, and sequence @@ -237,10 +303,10 @@ def to_dict(self, row_metadata=True, column_metadata=True): } if row_metadata: d['row_metadata'] = self.row_metadata.to_dict(orient='list') - d['row_metadata_index'] = self.row_metadata.index.to_list() + d['row_metadata_index'] = self.row_metadata.index.tolist() if column_metadata: d['column_metadata'] = self.column_metadata.to_dict(orient='list') - d['column_metadata_index'] = self.column_metadata.index.to_list() + d['column_metadata_index'] = self.column_metadata.index.tolist() return d class JsonSerdeMixin(DictSerdeMixin): @@ -366,7 +432,7 @@ def to_pickle(self, path=None, **kwargs): if not os.path.isdir(dirpath): raise OSError('{} does not exist'.format(dirpath)) with open(path, 'wb') as writer: - print(pickled, file=writer) + writer.write(pickled) def __getstate__(self): # This method gets called when the Alignment object @@ -435,7 +501,7 @@ def col_metadata_to_str(column_metadata, included_keys, encoders=None, template= for k, v in included_values ] str_index = [col_metadata_str_formatter( - 'index', column_metadata.index.to_list(), + 'index', column_metadata.index.tolist(), encoders['index'] if 'index' in encoders.keys() else None, index_template) ] diff --git a/alignmentrs/aln/mixins/tests/test_dict_serde.py b/alignmentrs/aln/mixins/tests/test_dict_serde.py index 2cd5a8a..1b7e512 100644 --- a/alignmentrs/aln/mixins/tests/test_dict_serde.py +++ b/alignmentrs/aln/mixins/tests/test_dict_serde.py @@ -57,9 +57,9 @@ def test_to_dict_with_row_col_meta(self): 'data': self.matrix.data, 'alignment_metadata': self.alignment_metadata, 'row_metadata': self.row_metadata.to_dict(orient='list'), - 'row_metadata_index': self.row_metadata.index.to_list(), + 'row_metadata_index': self.row_metadata.index.tolist(), 'column_metadata': self.column_metadata.to_dict(orient='list'), - 'column_metadata_index': self.column_metadata.index.to_list(), + 'column_metadata_index': self.column_metadata.index.tolist(), } assert exp_dict == test_dict, \ "expected and test dictionaries are not the same: {} != {}".format( @@ -74,7 +74,7 @@ def test_to_dict_with_row_meta(self): 'data': self.matrix.data, 'alignment_metadata': self.alignment_metadata, 'row_metadata': self.row_metadata.to_dict(orient='list'), - 'row_metadata_index': self.row_metadata.index.to_list(), + 'row_metadata_index': self.row_metadata.index.tolist(), } assert exp_dict == test_dict, \ "expected and test dictionaries are not the same: {} != {}".format( @@ -89,7 +89,7 @@ def test_to_dict_with_col_meta(self): 'data': self.matrix.data, 'alignment_metadata': self.alignment_metadata, 'column_metadata': self.column_metadata.to_dict(orient='list'), - 'column_metadata_index': self.column_metadata.index.to_list(), + 'column_metadata_index': self.column_metadata.index.tolist(), } assert exp_dict == test_dict, \ "expected and test dictionaries are not the same: {} != {}".format( @@ -102,9 +102,9 @@ def test_from_dict(self): 'data': self.matrix.data, 'alignment_metadata': self.alignment_metadata, 'row_metadata': self.row_metadata.to_dict(orient='list'), - 'row_metadata_index': self.row_metadata.index.to_list(), + 'row_metadata_index': self.row_metadata.index.tolist(), 'column_metadata': self.column_metadata.to_dict(orient='list'), - 'column_metadata_index': self.column_metadata.index.to_list(), + 'column_metadata_index': self.column_metadata.index.tolist(), } test_class = MockAlignment.from_dict(test_dict) exp_class = MockAlignment( diff --git a/alignmentrs/aln/mixins/tests/test_json_serde.py b/alignmentrs/aln/mixins/tests/test_json_serde.py index eabb41a..1036a25 100644 --- a/alignmentrs/aln/mixins/tests/test_json_serde.py +++ b/alignmentrs/aln/mixins/tests/test_json_serde.py @@ -59,9 +59,9 @@ def test_to_json_with_row_col_meta(self): 'data': self.matrix.data, 'alignment_metadata': self.alignment_metadata, 'row_metadata': self.row_metadata.to_dict(orient='list'), - 'row_metadata_index': self.row_metadata.index.to_list(), + 'row_metadata_index': self.row_metadata.index.tolist(), 'column_metadata': self.column_metadata.to_dict(orient='list'), - 'column_metadata_index': self.column_metadata.index.to_list(), + 'column_metadata_index': self.column_metadata.index.tolist(), }) assert exp_json == test_json, \ "expected and test json are not the same: {} != {}".format( @@ -76,7 +76,7 @@ def test_to_dict_with_row_meta(self): 'data': self.matrix.data, 'alignment_metadata': self.alignment_metadata, 'row_metadata': self.row_metadata.to_dict(orient='list'), - 'row_metadata_index': self.row_metadata.index.to_list(), + 'row_metadata_index': self.row_metadata.index.tolist(), }) assert exp_json == test_json, \ "expected and test dictionaries are not the same: {} != {}".format( @@ -91,7 +91,7 @@ def test_to_dict_with_col_meta(self): 'data': self.matrix.data, 'alignment_metadata': self.alignment_metadata, 'column_metadata': self.column_metadata.to_dict(orient='list'), - 'column_metadata_index': self.column_metadata.index.to_list(), + 'column_metadata_index': self.column_metadata.index.tolist(), }) assert exp_json == test_json, \ "expected and test dictionaries are not the same: {} != {}".format( @@ -104,9 +104,9 @@ def test_from_json(self): 'data': self.matrix.data, 'alignment_metadata': self.alignment_metadata, 'row_metadata': self.row_metadata.to_dict(orient='list'), - 'row_metadata_index': self.row_metadata.index.to_list(), + 'row_metadata_index': self.row_metadata.index.tolist(), 'column_metadata': self.column_metadata.to_dict(orient='list'), - 'column_metadata_index': self.column_metadata.index.to_list(), + 'column_metadata_index': self.column_metadata.index.tolist(), }) with tempfile.TemporaryFile(mode='r+') as f: f.write(exp_json) diff --git a/alignmentrs/aln/record.py b/alignmentrs/aln/record.py index 2b2a7dd..6d8708a 100644 --- a/alignmentrs/aln/record.py +++ b/alignmentrs/aln/record.py @@ -1,3 +1,5 @@ +""" Class for formatting sequence and metadata. """ + import pandas class Record: diff --git a/alignmentrs/aln/row.py b/alignmentrs/aln/row.py index 4493958..83313eb 100644 --- a/alignmentrs/aln/row.py +++ b/alignmentrs/aln/row.py @@ -1,7 +1,9 @@ -from copy import deepcopy -import numbers -import inspect -import itertools +""" Classes for retrieving and removing data row by row. """ + +# from copy import deepcopy +# import numbers +# import inspect +# import itertools import pandas diff --git a/alignmentrs/tests/test_utils.py b/alignmentrs/tests/test_utils.py new file mode 100644 index 0000000..e11bb80 --- /dev/null +++ b/alignmentrs/tests/test_utils.py @@ -0,0 +1,106 @@ +""" Unit test for utils functions. """ + +import tempfile +from nose.tools import * + +from alignmentrs.utils import fasta_file_to_lists, alignment_file_to_lists + +class TestFastaFileReader: + """ Unit tests for reading normal FASTA formatted text file. """ + def setup(self): + self.fp = tempfile.NamedTemporaryFile(mode='w+') + lines = [ + '>seq1 here_is_description1', + 'ATGCATGCATGC', + '>seq2 here_is_description2', + 'ATGCAAN-ATGCAAA', + '>marker1 here_is_description_for_marker1', + 'CCCCCCCCCCNNCCC', + '>marker2 here_is_description_for_marker2', + '000011110011' + ] + self.fp.write('\n'.join(lines)) + self.fp.seek(0) + self.path = self.fp.name + + def teardown(self): + self.fp.close() + + def test_fasta_file_to_lists(self): + """ Tests if fasta_file_to_lists function returns contents of + a FASTA file as expected. """ + fasta_d = fasta_file_to_lists(self.path, marker_kw='marker') + + exp_sample = { + 'ids': ['seq1', 'seq2'], + 'descriptions': ['here_is_description1', 'here_is_description2'], + 'sequences': ['ATGCATGCATGC', 'ATGCAAN-ATGCAAA'] + } + assert fasta_d['sample'] == exp_sample, \ + 'Sequence data read from file is not the same as expected: {}'\ + .format(fasta_d['sample']) + + exp_marker = { + 'ids': ['marker1', 'marker2'], + 'descriptions': [ + 'here_is_description_for_marker1', + 'here_is_description_for_marker2'], + 'sequences': ['CCCCCCCCCCNNCCC', '000011110011'] + } + assert fasta_d['marker'] == exp_marker, \ + 'Marker data read from file is not the same as expected: {}'\ + .format(fasta_d['marker']) + + @raises(AssertionError) + def test_alignment_file_assertion(self): + """ Tests if alignment_file_to_lists function detects error in sequence + lengths. """ + fasta_d = alignment_file_to_lists(self.path, marker_kw='marker') + +class TestAlignmentFileReader: + """ Unit tests for functions for reading alignment file. """ + def setup(self): + """ Create a dummy alignment file. """ + self.fp = tempfile.NamedTemporaryFile(mode='w+') + lines = [ + '>seq1 here_is_description1', + 'ATGCATGCATGC', + '>seq2 here_is_description2', + 'ATGCAAN-ATGC', + '>marker1 here_is_description_for_marker1', + 'CCCCCCCCCCNN', + '>marker2 here_is_description_for_marker2', + '000011110011' + ] + self.fp.write('\n'.join(lines)) + self.fp.seek(0) + self.path = self.fp.name + + + def teardown(self): + self.fp.close() + + def test_alignment_file_to_lists(self): + """ Tests if alignment_file_to_lists function returns contents of + a FASTA file as expected. """ + fasta_d = alignment_file_to_lists(self.path, marker_kw='marker') + + exp_sample = { + 'ids': ['seq1', 'seq2'], + 'descriptions': ['here_is_description1', 'here_is_description2'], + 'sequences': ['ATGCATGCATGC', 'ATGCAAN-ATGC'] + } + assert fasta_d['sample'] == exp_sample, \ + 'Sequence data read from file is not the same as expected: {}'\ + .format(fasta_d['sample']) + + exp_marker = { + 'ids': ['marker1', 'marker2'], + 'descriptions': [ + 'here_is_description_for_marker1', + 'here_is_description_for_marker2'], + 'sequences': ['CCCCCCCCCCNN', '000011110011'] + } + assert fasta_d['marker'] == exp_marker, \ + 'Marker data read from file is not the same as expected: {}'\ + .format(fasta_d['marker']) diff --git a/alignmentrs/utils.py b/alignmentrs/utils.py index a559623..8fe38c9 100644 --- a/alignmentrs/utils.py +++ b/alignmentrs/utils.py @@ -11,6 +11,7 @@ def idseq_to_display(ids, chunked_sequences, template='{name} {seq}', max_length=20, id_width=15, sequence_width=55): + ids = list(map(str, ids)) if not len(ids): return '' def chunked_fn(x): @@ -129,6 +130,34 @@ def fasta_file_to_lists(path, marker_kw=None): } } +def alignment_file_to_lists(path, marker_kw=None): + """ Reads a FASTA formatted text file to a list. Nucleotide or amino acid + sequences and marker sequences should be the same lengths. + + Parameters + ---------- + path : str + Location of FASTA file. + marker_kw : str + Keyword indicating the sample is a marker. + + Returns + ------- + dict + Contains list of ids, descriptions, and sequences for sample + and marker categories. + + """ + fasta = fasta_file_to_lists(path, marker_kw) + + # Check if lengths of sequences are the same + sequences = fasta['sample']['sequences'] + fasta['marker']['sequences'] + len_array = [len(seq) for seq in sequences] + assert min(len_array) == max(len_array), \ + 'Wrong sequence lengths: different lengths of sequences exist. '\ + '{} <= seq_len <= {}.'.format(min(len_array), max(len_array)) + + return fasta def parse_comment_list(comment_list: list): comments_d = dict() diff --git a/examples/01_Reading_alignments.ipynb b/examples/01_Reading_alignments.ipynb index 36d5319..154a95c 100644 --- a/examples/01_Reading_alignments.ipynb +++ b/examples/01_Reading_alignments.ipynb @@ -766,7 +766,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.2" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/examples/02a_Removing_rows.ipynb b/examples/02a_Removing_rows.ipynb index f630b84..94e0313 100644 --- a/examples/02a_Removing_rows.ipynb +++ b/examples/02a_Removing_rows.ipynb @@ -1127,7 +1127,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.2" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/examples/02b_Removing_columns.ipynb b/examples/02b_Removing_columns.ipynb index 55a9dc6..038ef2c 100644 --- a/examples/02b_Removing_columns.ipynb +++ b/examples/02b_Removing_columns.ipynb @@ -1266,7 +1266,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.2" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/examples/03a_Filtering_rows.ipynb b/examples/03a_Filtering_rows.ipynb index d777fb0..4e8ffe4 100644 --- a/examples/03a_Filtering_rows.ipynb +++ b/examples/03a_Filtering_rows.ipynb @@ -1110,7 +1110,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.2" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/setup.py b/setup.py index 57844a9..66b5881 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name='alignmentrs', author='Kent Kawashima', - version='0.10.0', + version='0.10.3', author_email='kentkawashima@gmail.com', description='Quickly read and manipulate multiple sequence alignments in Python', long_description=long_description, diff --git a/src/alignment.rs b/src/alignment.rs index e281bc6..cdf5539 100644 --- a/src/alignment.rs +++ b/src/alignment.rs @@ -125,7 +125,11 @@ impl SeqMatrix { /// Keep rows matching the specified row indices, and removes everything else. pub fn _retain_rows(&mut self, ids: Vec) -> Result<(), String> { - self._drop_rows(ids, true) + if ids.len() == 0 { + let ids: Vec = (0..self._nrows()).map(|i| i as i32).collect(); + return self._drop_rows(ids, false) + } + return self._drop_rows(ids, true) } /// Generalized method used to remove rows from the sequence matrix. @@ -255,7 +259,11 @@ impl SeqMatrix { /// Keep columns matching the specified columns indices, and removes everything else. pub fn _retain_cols(&mut self, ids: Vec) -> Result<(), String> { - self._drop_cols(ids, true) + if ids.len() == 0 { + let ids: Vec = (0..self._ncols()).map(|i| i as i32).collect(); + return self._drop_cols(ids, false) + } + return self._drop_cols(ids, true) } /// Generalized method used to remove columns from the sequence matrix.