diff --git a/.gitignore b/.gitignore index 82f9275..21ea91e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,9 @@ __pycache__/ *.py[cod] *$py.class +# IDE +.idea + # C extensions *.so @@ -160,3 +163,7 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +# + +# syn nas +@eaDir/ diff --git a/cdm_data_loader_utils/__init__.py b/cdm_data_loader_utils/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/cdm_data_loader_utils/biodb/__init__.py b/cdm_data_loader_utils/biodb/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/cdm_data_loader_utils/biodb/ncbi.py b/cdm_data_loader_utils/biodb/ncbi.py new file mode 100755 index 0000000..e69de29 diff --git a/cdm_data_loader_utils/core/__init__.py b/cdm_data_loader_utils/core/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/cdm_data_loader_utils/core/genome.py b/cdm_data_loader_utils/core/genome.py new file mode 100755 index 0000000..891c63a --- /dev/null +++ b/cdm_data_loader_utils/core/genome.py @@ -0,0 +1,205 @@ +from modelseedpy.core.msgenome import MSGenome, read_fasta2 +from cdm_data_loader_utils.core.hash_seq import HashSeq, HashSeqList +from collections import Counter + + +class CDMProtocol: + + def __init__(self, protocol_id, parent_id, name, version, identifier, sim_t, inputs, outputs): + self.id = protocol_id + self.parent_id = parent_id + self.name = name + self.version = version + self.identifier = identifier + self.sim_t = sim_t + self.inputs = inputs + self.outputs = outputs + + +class CDMContigSet: + def __init__(self, hash_contigset): + self.hash_contigset = hash_contigset + self.contigs = [] + + @property + def size(self): + size = 0 + for contig in self.contigs: + size += len(contig.seq) + return size + + @staticmethod + def from_contigs(contigs: list): + h_list_contigs = HashSeqList() + for v in contigs: + h_list_contigs.append(HashSeq(str(v.seq))) + cdm_contigset = CDMContigSet(h_list_contigs.hash_value) + cdm_contigset.contigs.extend(contigs) + return cdm_contigset + + +class CDMContig: + def __init__(self, seq: str): + self.seq = seq + self.hash_contig = HashSeq(self.seq).hash_value + self.base_count = dict(Counter(list(self.seq.upper()))) + self.length = len(self.seq) + self.gc = ( + self.base_count.get("G", 0) + self.base_count.get("C", 0) + ) / self.length + + self.names = [] + + def __repr__(self): + return f"len: {self.length}, gc: {self.gc}, base_count: {self.base_count}, names: {self.names}" + + +class CDMProtein: + def __init__(self, seq: str): + self.stop_codon = False + _seq = seq + if _seq[-1] == "*": + _seq = _seq[:-1] + self.stop_codon = True + + self.seq = _seq + self.hash = HashSeq(self.seq).hash_value + self.length = len(self.seq) + + self.names = [] + + def __repr__(self): + return f"len: {self.length}, hash: {self.hash}" + + +class CDMFeature: + def __init__( + self, feature_id: str, contigset_x_contig_id: str, start: int, end: int, strand: str, + feature_type: str, source=None, phase=None, protocol=None, attributes=None + ): + self.id = feature_id + self.contigset_x_contig_id = contigset_x_contig_id + self.start = start + self.end = end + self.strand = strand + self.type = feature_type + self.source = source + self.cds_phase = phase + self.protocol = protocol + self.attributes = {} if attributes is None else attributes + + self.names = [] + + +class GffRecord: + def __init__( + self, + contig_id: str, + source: str, + feature_type, + start: int, + end: int, + score, + strand, + phase, + attr, + ): + self.contig_id = contig_id + self.source = source + self.feature_type = feature_type + self.start = start + self.end = end + self.score = score + self.strand = strand + self.phase = phase + self.attr = attr + + def get_attribute_string(self): + attr_values = [] + for k, v in self.attr.items(): + attr_values.append(f"{k}={v}") + return ";".join(attr_values) + + def __str__(self): + return "\t".join( + [ + str(x) + for x in [ + self.contig_id, + self.source, + self.feature_type, + self.start, + self.end, + self.score, + self.strand, + self.phase, + self.get_attribute_string(), + ] + ] + ) + + @staticmethod + def from_str(s): + ( + contig_id, + source, + feature_type, + start, + end, + score, + strand, + phase, + attr_str, + ) = s.strip().split("\t") + attr = dict([x.split("=") for x in attr_str.split(";")]) + return GffRecord( + contig_id, + source, + feature_type, + int(start), + int(end), + score, + strand, + phase, + attr, + ) + + +class REAssembly(MSGenome): + def __init__(self): + super().__init__() + self.hash_list = HashSeqList() + for contig in self.features: + seq = HashSeq(contig.seq) + self.hash_list.append(seq) + + def re(self): + pass + + def ke(self): + pass + + @staticmethod + def from_fasta(filename, split=" ", h_func=None): + genome = REAssembly() + genome.features += read_fasta2(filename, split, h_func) + return genome + + @property + def hash_value(self): + hl = HashSeqList() + for contig in self.features: + seq = HashSeq(contig.seq) + hl.append(seq) + return hl.hash_value + + @staticmethod + def _process_contigs(contigs): + hash_list = HashSeqList() + contig_h_d = [] + for contig in contigs.features: + seq = HashSeq(contig.seq) + hash_list.append(seq) + seq_h = seq.hash_value + contig_h_d.append([seq_h, contig.id, contig.description]) + return {"genome_h": hash_list.hash_value, "contig_h": contig_h_d} diff --git a/cdm_data_loader_utils/core/hash_seq.py b/cdm_data_loader_utils/core/hash_seq.py new file mode 100755 index 0000000..dd84a9b --- /dev/null +++ b/cdm_data_loader_utils/core/hash_seq.py @@ -0,0 +1,33 @@ +import hashlib + + +def _hash_string(s): + return hashlib.sha256(s.encode("utf-8")).hexdigest() + + +class HashSeq(str): + def __new__(cls, v): + # print('validate!!', v) + instance = super().__new__(cls, v.upper()) + return instance + + @property + def hash_value(self): + h = _hash_string(self) + return h + + +class HashSeqList(list): + def append(self, o, /): + if type(o) is str: + super().append(HashSeq(o)) + elif type(o) is HashSeq: + super().append(o) + else: + raise ValueError("bad type") + + @property + def hash_value(self): + h_list = [x.hash_value for x in self] + hash_seq = "_".join(sorted(h_list)) + return _hash_string(hash_seq) diff --git a/cdm_data_loader_utils/etl/load_mysql.py b/cdm_data_loader_utils/etl/load_mysql.py new file mode 100755 index 0000000..bb76d71 --- /dev/null +++ b/cdm_data_loader_utils/etl/load_mysql.py @@ -0,0 +1,223 @@ +import logging +import json +import polars as pl +from sqlalchemy import text +from sqlalchemy.orm import Session + +logger = logging.getLogger(__name__) + + +LOADER_CONFIGURATION = { + 'contigset': [ + ['hash_contigset'], + { + 'hash_contigset': (None, str), + 'n_contigs': (None, int), + 'size': (None, int), + } + ], + 'contig': [ + ['hash_contig'], + { + 'hash_contig': (None, str), + 'length': (None, int), + 'gc_content': (None, float), + 'base_count': (None, dict) + } + ], + 'contigset_x_contig': [ + ['hash_contigset_x_contig'], + { + 'hash_contigset_x_contig': (None, str), + 'hash_contigset': (None, str), + 'hash_contig': (None, str), + 'contig_index': (None, int) + } + ], + 'protocol': [ + ['protocol_id'], + { + 'protocol_id': (None, str), + 'parent_protocol_id': (None, str), + 'name': (None, str), + 'version': (None, str), + 'identifier': (None, str), + 'inputs': (None, str), + 'outputs': (None, str), + } + ], + 'feature': [ + ['feature_id'], + { + 'feature_id': (None, str), + 'hash_contigset_x_contig': (None, str), + 'source': (None, str), + 'protocol_id': (None, str), + 'type': (None, str), + 'start': (None, int), + 'end': (None, int), + 'strand': (None, str), + 'cds_phase': (None, int), + } + ], + 'protein': [ + ['hash_protein_sequence'], + { + 'hash_protein_sequence': (None, str), + 'length': (None, int), + 'sequence': (None, str) + } + ], + 'feature_x_protein': [ + ['feature_id', 'hash_protein_sequence'], + { + 'feature_id': (None, str), + 'hash_protein_sequence': (None, str), + 'has_stop_codon': (None, bool) + } + ] +} + + +class LoadSQL: + + def __init__(self, engine, table_name: str, columns_pk: list, columns_config: dict): + self.engine = engine + self.batch_size = 100000 + self.table_name = table_name + self.columns_pk = columns_pk + self.columns_config = columns_config + + @staticmethod + def build_from_config(engine, table_name, config): + columns_pk = config[0] + columns_config = config[1] + return LoadSQL(engine, table_name, columns_pk, columns_config) + + def load(self, filename, pos=0): + fn_fetch_keys = self.fetch_keys(self.table_name, self.columns_pk) + self._load(filename, self.table_name, fn_fetch_keys, self.columns_pk, self.columns_config, pos=pos) + + def fetch_keys(self, table_name, pk): + def fn_fetch_keys(): + table_ids = set() + with self.engine.connect() as con: + select_columns = ', '.join(pk) + query = f"SELECT {select_columns} FROM {table_name};" + rs = con.execute(text(query)) + for o in rs: + pk_tuple = tuple([o[i] for i in range(len(pk))]) + table_ids.add(pk_tuple) + return table_ids + + return fn_fetch_keys + + def load_contigset(self, filename, pos=0): + table_name = 'contigset' + pk = ['hash_contigset'] + columns = { + 'hash_contigset': (None, str), + 'n_contigs': (None, int), + 'size': (None, int), + } + fn_fetch_keys = self.fetch_keys(table_name, pk) + self._load(filename, table_name, fn_fetch_keys, pk, columns, pos) + + def load_contig(self, filename, pos=0): + table_name = 'contig' + pk = ['hash_contig'] + columns = { + 'hash_contig': (None, str), + 'length': (None, int), + 'gc_content': (None, float), + 'base_count': (None, dict) + } + fn_fetch_keys = self.fetch_keys(table_name, pk) + self._load(filename, table_name, fn_fetch_keys, pk, columns, pos) + + def load_contigset_x_contig(self, filename, pos=0): + table_name = 'contigset_x_contig' + pk = ['hash_contigset_x_contig'] + columns = { + 'hash_contigset_x_contig': (None, str), + 'hash_contigset': (None, str), + 'hash_contig': (None, str), + 'contig_index': (None, int) + } + fn_fetch_keys = self.fetch_keys(table_name, pk) + self._load(filename, table_name, fn_fetch_keys, pk, columns, pos) + + @staticmethod + def process_type(v, t): + if v is None: + return 'NULL' + elif t is str: + return f"'{v}'" + elif t is int: + return int(v) + elif t is float: + return float(v) + elif t is bool: + if v: + return 'TRUE' + else: + return 'FALSE' + elif t is dict: + json_str = json.dumps({k: v for k, v in v.items() if v is not None}) + return f"'{json_str}'" + raise ValueError(f'invalid type {t}') + + def process_rows(self, table_ids: set, rows: list, columns_pk: list, columns: dict): + + insert_columns = list(columns) + + sql_values = [] + for row in rows: + logger.debug(row) + + pk = tuple([row[x] for x in columns_pk]) + + logger.debug(pk) + + if pk not in table_ids: + table_ids.add(pk) + + sql_row = [] + for column_name in insert_columns: + df_id, column_type = columns[column_name] + v = row[column_name] if df_id is None else row[df_id] + sql_row.append(str(self.process_type(v, column_type))) + + sql_str = ', '.join(sql_row) + + logger.debug(sql_str) + sql_values.append(f'({sql_str})') + + return insert_columns, sql_values + + def _load(self, filename, table_name, fn_fetch_keys, columns_pk, columns, pos=0): + lazy_frame = pl.scan_parquet(filename) + df_size = lazy_frame.select(pl.len()).collect()['len'][0] + table_ids = fn_fetch_keys() + + print(df_size, len(table_ids)) + + while pos < df_size: + with Session(self.engine) as session: + rows = lazy_frame.slice(pos, self.batch_size).collect(streaming=False).rows(named=True) + insert_columns, sql_values = self.process_rows(table_ids, rows, columns_pk, columns) + + sql_columns = ', '.join(insert_columns) + if len(sql_values) > 0: + sql_insert = f""" + INSERT INTO {table_name} + ({sql_columns}) + VALUES + """ + sql_insert += ',\n'.join(sql_values) + sql_insert += ';' + print('payload len:', len(sql_insert)) + session.execute(text(sql_insert)) + session.commit() + pos += self.batch_size + print(pos) diff --git a/cdm_data_loader_utils/etl_genome.py b/cdm_data_loader_utils/etl_genome.py new file mode 100755 index 0000000..e0b7a9d --- /dev/null +++ b/cdm_data_loader_utils/etl_genome.py @@ -0,0 +1,1054 @@ +import pyarrow as pa +import pandas as pd +import os +from modelseedpy import MSGenome +from cdm_data_loader_utils.core.genome import ( + CDMContig, + CDMContigSet, + CDMFeature, + CDMProtein, +) +from tqdm import tqdm + + +def _get_feature_contig_kbase(feature): + contigs = {x[0] for x in feature.location} + if len(contigs) == 1: + return list(contigs)[0] + raise ValueError(f'Feature: {feature.id} - invalid contig location {contigs}') + + +def _get_feature_contig(feature): + _id = feature.id + _contig_name = _id[::-1].split("_", 1)[1][::-1] # reverse split once reverse again + return _contig_name + + +def _get_start_end_strand_s_attr(feature): + start, end, strand_s, attr = feature.description.split(" # ") + if start.startswith("# "): + start = int(start[1:].strip()) + end = int(end) + strand = "?" + if strand_s == "1": + strand = "+" + elif strand_s == "-1": + strand = "-" + else: + raise ValueError(f"bad strand: {strand_s}") + + if attr: + attr = dict(x.split("=") for x in attr.split(";")) + else: + attr = {} + + return start, end, strand, attr + + +def _get_start_end_strand_s_attr_kbase(feature): + for location in feature.location: + contig, p1, strand, p2 = location + start = p1 + end = p2 + if strand == '+': + start = p1 + end = p1 + p2 - 1 + elif strand == '-': + start = p1 - p2 + 1 + end = p1 + return start, end, strand, {} + + return + + +def _get_feature_id(feature): + return feature.id + + +class GbffAssemblyAndGenome: + + def __init__(self, gbff_records: list): + if gbff_records is None or len(gbff_records) == 0: + raise ValueError('empty gbff features') + + self.gbff_records = gbff_records + + @property + def contigs(self): + for contig in self.gbff_records: + yield contig + + @staticmethod + def from_file(filename): + from cdm_data_loader_utils.io.genome import read_gbff_records_from_file + return GbffAssemblyAndGenome(read_gbff_records_from_file(filename)) + + +class ETLGbffGenome: + + def __init__(self): + self._data_cdm_contig = {} + self._data_cdm_contigset_x_contig = set() + self._cdm_contigset = None + self._data_cdm_protein = {} + self._data_cdm_feature = {} + self._data_cdm_feature_x_protein = set() + self._data_feature_qualifiers = {} + self.feature_source = 'gbff' + self.qualifier_translation = 'translation' + self.feature_id_max_len = 62 + self.fn_feature_metadata = None + + def etl(self, genome_id, protocol_id, contig_feature_pairs): + self.etl_assembly([x[0] for x in contig_feature_pairs]) + self.etl_features(genome_id, protocol_id, contig_feature_pairs) + + def etl_assembly(self, contigs): + l_contig = [] + h_to_gbff_record = {} + for contig in contigs: + cdm_contig = contig if type(contig) == CDMContig else CDMContig(str(contig.seq)) + if cdm_contig.hash_contig not in self._data_cdm_contig: + self._data_cdm_contig[cdm_contig.hash_contig] = cdm_contig + if cdm_contig.hash_contig not in h_to_gbff_record: + h_to_gbff_record[cdm_contig.hash_contig] = [] + h_to_gbff_record[cdm_contig.hash_contig].append(contig) + l_contig.append(cdm_contig) + self._cdm_contigset = CDMContigSet.from_contigs(l_contig) + + return h_to_gbff_record + + def register_contigset_x_contig(self, cdm_contigset, cdm_contig, index_contig): + contigset_x_contig_id = ETLGbffGenome.get_contigset_x_contig_id(cdm_contigset, + cdm_contig, + index_contig) + + self._data_cdm_contigset_x_contig.add(( + contigset_x_contig_id, + cdm_contigset.hash_contigset, + cdm_contig.hash_contig, + index_contig + )) + + return contigset_x_contig_id + + def etl_features(self, genome_id, protocol_id, contig_feature_pairs): + h_to_contig_to_features = {} + for cdm_contig, features in contig_feature_pairs: + if cdm_contig.hash_contig not in h_to_contig_to_features: + h_to_contig_to_features[cdm_contig.hash_contig] = [] + h_to_contig_to_features[cdm_contig.hash_contig].append((cdm_contig, features)) + + # h_to_gbff_record = self.etl_assembly(contigs) + + repeat_counter = {} + for hash_contig in h_to_contig_to_features: + cdm_contig = self._data_cdm_contig[hash_contig] + index_contig = 0 + contigset_x_contig_id = self.register_contigset_x_contig(self._cdm_contigset, cdm_contig, index_contig) + + for _cdm_contig, features in h_to_contig_to_features[hash_contig]: + for gbff_feature in features: + feature_id = ETLGbffGenome._get_feature_id(genome_id, gbff_feature) + if feature_id in self._data_cdm_feature: + counter = repeat_counter.get(feature_id, 1) + repeat_counter[feature_id] = counter + 1 + feature_id = f'{feature_id}_{counter}' + cdm_feature = ETLGbffGenome.convert_to_cdm_feature(feature_id, + contigset_x_contig_id, + gbff_feature, + gbff_feature.type) + # set protocol + cdm_feature.protocol = protocol_id + # set source + cdm_feature.source = self.feature_source + self._data_feature_qualifiers[cdm_feature.id] = self.get_other_qualifiers(gbff_feature) + + if len(cdm_feature.id) < self.feature_id_max_len and cdm_feature.id not in self._data_cdm_feature: + self._data_cdm_feature[cdm_feature.id] = cdm_feature + else: + if len(cdm_feature.id) >= self.feature_id_max_len: + raise ValueError(f'x feature id {cdm_feature.id}') + else: + raise ValueError(f'duplicate feature id {cdm_feature.id}') + if gbff_feature.type == 'CDS': + cdm_protein = self.get_protein(gbff_feature) + if cdm_protein and cdm_protein.hash not in self._data_cdm_protein: + self._data_cdm_protein[cdm_protein.hash] = cdm_protein + self._data_cdm_feature_x_protein.add((cdm_feature.id, cdm_protein.hash)) + index_contig += 1 + + @staticmethod + def _get_feature_id(genome_id, f): + start = int(f.location.start) + end = int(f.location.end) + feature_id = f'{genome_id}_{f.type}_{start}_{end}' + if 'locus_tag' in f.qualifiers and len(f.qualifiers['locus_tag']) == 1: + locus = f.qualifiers['locus_tag'][0] + return f'{genome_id}_{locus}_{f.type}_{start}_{end}' + return feature_id + + @staticmethod + def get_contigset_x_contig_id(cdm_contigset, cdm_contig, index_contig): + contigset_x_contig_str = f'{cdm_contigset.hash_contigset}_{cdm_contig.hash_contig}_{index_contig}' + return ETLGbffGenome._hash_string(contigset_x_contig_str) + + + def get_other_qualifiers(self, f): + qualifiers = {} + for k in f.qualifiers: + if k not in {self.qualifier_translation}: + qualifiers[k] = f.qualifiers[k] + return qualifiers + + def get_protein(self, f): + translation = f.qualifiers.get(self.qualifier_translation) + if translation is None: + return None + if type(translation) == list and len(translation) == 1: + return CDMProtein(translation[0]) + raise ValueError(f'unable to get translation: {f.qualifiers}') + + @staticmethod + def get_strand(f_strand): + if f_strand == 1: + return '+' + elif f_strand == -1: + return '-' + raise ValueError('strand error') + + @staticmethod + def convert_to_cdm_feature(feature_id, contigset_x_contig_id, f, feature_type): + start = int(f.location.start) + end = int(f.location.end) + strand = ETLGbffGenome.get_strand(f.location.strand) + cdm_feature = CDMFeature(feature_id, contigset_x_contig_id, start, end, strand, feature_type) + return cdm_feature + + @staticmethod + def _hash_string(s): + import hashlib + return hashlib.sha256(s.encode("utf-8")).hexdigest() + + +class ETLAssemblyAndGenome(ETLGbffGenome): + + def __init__(self): + super().__init__() + self.feature_source = 'kbase' + + @staticmethod + def convert_to_cdm_feature(feature_id, contigset_x_contig_id, f, feature_type): + start, end, strand, attr = _get_start_end_strand_s_attr_kbase(f) + cdm_feature = CDMFeature(feature_id, contigset_x_contig_id, start, end, strand, feature_type) + return cdm_feature + + def register_contigset_x_contig(self, cdm_contigset, cdm_contig, index_contig): + contigset_x_contig_id = ETLGbffGenome.get_contigset_x_contig_id(cdm_contigset, + cdm_contig, + index_contig) + + self._data_cdm_contigset_x_contig.add(( + contigset_x_contig_id, + cdm_contigset.hash_contigset, + cdm_contig.hash_contig, + index_contig + )) + + return contigset_x_contig_id + + def get_protein(self, f): + seq = f.seq + if seq: + return CDMProtein(seq) + else: + return None + + def etl_features(self, genome_id, protocol_id, contig_feature_pairs): + h_to_contig_to_features = {} + for cdm_contig, features in contig_feature_pairs: + if cdm_contig.hash_contig not in h_to_contig_to_features: + h_to_contig_to_features[cdm_contig.hash_contig] = [] + h_to_contig_to_features[cdm_contig.hash_contig].append((cdm_contig, features)) + + for hash_contig in h_to_contig_to_features: + index_contig = 0 + cdm_contig = self._data_cdm_contig[hash_contig] + + contigset_x_contig_id = self.register_contigset_x_contig(self._cdm_contigset, cdm_contig, index_contig) + + for _cdm_contig, features in h_to_contig_to_features[hash_contig]: + for feature in features: + feature_id = f'{genome_id}_{feature.id}' + cdm_feature = ETLAssemblyAndGenome.convert_to_cdm_feature(feature_id, + contigset_x_contig_id, + feature, + 'CDS') + # set protocol + cdm_feature.protocol = protocol_id + # set source + cdm_feature.source = self.feature_source + + if self.fn_feature_metadata: + self._data_feature_qualifiers[cdm_feature.id] = self.fn_feature_metadata(feature) + + if len(cdm_feature.id) < self.feature_id_max_len and cdm_feature.id not in self._data_cdm_feature: + self._data_cdm_feature[cdm_feature.id] = cdm_feature + else: + if len(cdm_feature.id) >= self.feature_id_max_len: + raise ValueError(f'x feature id {cdm_feature.id}') + else: + raise ValueError(f'duplicate feature id {cdm_feature.id}') + + cdm_protein = self.get_protein(feature) + if cdm_protein and cdm_protein.hash not in self._data_cdm_protein: + self._data_cdm_protein[cdm_protein.hash] = cdm_protein + self._data_cdm_feature_x_protein.add((cdm_feature.id, cdm_protein.hash)) + index_contig += 1 + + +class MapAssemblyToGff: + + def __init__(self, assembly, gff_records): + from modelseedpy_ext.re.hash_seq import HashSeq + self.contig_id_to_feature_contig = {} + self.gff_index_to_contig = {} + self.contig_to_hash_seq = {} + + from collections import Counter + counter = Counter() + for gff_record in gff_records: + counter.update([gff_record.feature_type]) + + for gff_record in gff_records: + if gff_record.contig_id not in self.contig_id_to_feature_contig: + self.contig_id_to_feature_contig[gff_record.contig_id] = None + + if 'DNA' in counter: + for gff_record in gff_records: + if gff_record.feature_type == 'DNA': + feature_contig = MapAssemblyToGff.get_feature_contig(gff_record, assembly) + if gff_record.contig_id in self.contig_id_to_feature_contig: + self.contig_id_to_feature_contig[gff_record.contig_id] = feature_contig + if 'Alias' in gff_record.attr and gff_record.attr.get('Alias') in self.contig_id_to_feature_contig: + alias = gff_record.attr.get('Alias') + if self.contig_id_to_feature_contig[alias] is None: + self.contig_id_to_feature_contig[alias] = feature_contig + elif self.contig_id_to_feature_contig[alias] != feature_contig: + raise ValueError('!') + else: + for f in assembly.features: + if f.id in self.contig_id_to_feature_contig: + self.contig_id_to_feature_contig[f.id] = f + + for k in self.contig_id_to_feature_contig: + feature_contig = self.contig_id_to_feature_contig[k] + if feature_contig is None: + raise ValueError(f'Unmapped contig: {k}') + if feature_contig.id not in self.contig_to_hash_seq: + hash_contig = HashSeq(feature_contig.seq).hash_value + self.contig_to_hash_seq[feature_contig.id] = hash_contig + + for i in range(len(gff_records)): + gff_record = gff_records[i] + feature_contig = self.contig_id_to_feature_contig[gff_record.contig_id] + if feature_contig is None: + raise ValueError(f'Unable to find contig: {gff_record}') + + self.gff_index_to_contig[i] = feature_contig + + @staticmethod + def get_feature_contig(gff_record, assembly): + contig_id = gff_record.contig_id + if contig_id in assembly.features: + return assembly.features.get_by_id(contig_id) + else: + contig_id = gff_record.attr['Note'].split(',')[0][:-1] + if contig_id in assembly.features: + return assembly.features.get_by_id(contig_id) + else: + contig_id = gff_record.attr['Note'].split(" ")[0] + return assembly.features.get_by_id(contig_id) + + +class ETLAlexeyGenome: + _S = len('/scratch/shared/CDM/alexey_genomes_ranjan/genomes_final/genome_processing/') + _STRIP_LEN = -1 * len('_contigs.fasta') + + def __init__(self, sql_engine, exclude=None): + self.cdm_contigsets = {} + self.cdm_name_contigset = set() + self.cdm_name_contig = set() + self.genome_scan = {} + self.data_contigs = {} + self.data_features = {} + self.data_feature_to_protein = {} + self.cdm_name_feature = set() + self.cdm_feature_annotation = {} + self.sql_engine = sql_engine + self.exclude = exclude + if self.exclude is None: + self.exclude = set() + + @staticmethod + def swap_dict(d): + res = {} + for k, v in d.items(): + if v not in res: + res[v] = k + else: + raise ValueError('not one to one') + return res + + @staticmethod + def _a_sz(a): + _sz = 0 + for f in a.features: + _sz += len(f.seq) + return _sz + + def scan_genomes(self, path_contigs): + for path_contig in tqdm(path_contigs): + self.scan_genome(path_contig) + + df_alexey_genome = pd.read_sql("SELECT * FROM browser_genome;", self.sql_engine) + + alexey_unique_genome_names = set(df_alexey_genome['name']) + missing = alexey_unique_genome_names - set(self.genome_scan) + not_in_alexey = set(self.genome_scan) - alexey_unique_genome_names + both = set(self.genome_scan) & alexey_unique_genome_names + print(len(alexey_unique_genome_names), len(missing), len(not_in_alexey), len(both)) + + def scan_genome(self, path_contig): + _prefix_filename = path_contig[:ETLAlexeyGenome._STRIP_LEN] + path_faa = _prefix_filename + '_Prodigal_proteins.faa' + path_gff = _prefix_filename + '_Prodigal.gff' + + if os.path.exists(path_faa) and os.path.exists(path_gff): + _p = path_contig[ETLAlexeyGenome._S:].split('/') + base_name = _p[0] + base_name_v = _p[2] + if base_name_v not in self.genome_scan: + self.genome_scan[base_name_v] = { + 'path_contigs.fasta': path_contig, + 'path_faa': path_faa, + 'path_gff': path_gff + } + else: + print('!0', path_contig) + + def process_genomes(self): + df_alexey_genome = pd.read_sql("SELECT * FROM browser_genome;", self.sql_engine) + + alexey_unique_genome_names = set(df_alexey_genome['name']) + missing = alexey_unique_genome_names - set(self.genome_scan) + not_in_alexey = set(self.genome_scan) - alexey_unique_genome_names + both = set(self.genome_scan) & alexey_unique_genome_names + print(len(alexey_unique_genome_names), len(missing), len(not_in_alexey), len(both)) + + for row_id, d in tqdm(df_alexey_genome.iterrows()): + genome_name = d['name'] + if genome_name in both and genome_name not in self.exclude: + _scan_data = self.genome_scan[genome_name] + assembly = REAssembly.from_fasta(_scan_data['path_contigs.fasta']) + assembly_size = ETLAlexeyGenome._a_sz(assembly) + contigs = len(assembly.features) + contigs_ids = [x.id for x in assembly.features] + + match = assembly_size == d['size'] and contigs == d['contigs'] + if match: + id_genome_alexey = d['id'] + self.process_contigset(assembly, genome_name, id_genome_alexey) + + df_contigs = pd.read_sql( + f"SELECT * FROM browser_contig WHERE genome_id = {id_genome_alexey};", + self.sql_engine) + d_id_contig_alexey_to_contig_name = ETLAlexeyGenome.swap_dict( + df_contigs.set_index('id')['contig_id'].to_dict()) + #genome = MSGenome.from_fasta(_scan_data['path_faa']) + gff_records = _read_gff_features(_scan_data['path_gff']) + + from collections import Counter + counter = Counter() + for gff_record in gff_records: + counter.update([gff_record.feature_type]) + gff_feature_types = dict(counter) + + try: + if 'DNA' in gff_feature_types: + self.process_contigs(gff_records, assembly, d_id_contig_alexey_to_contig_name) + else: + self.process_contig_no_gff(assembly, d_id_contig_alexey_to_contig_name) + except Exception as ex: + print(f'I failed to parse this: {genome_name}') + raise ValueError(ex) + else: + print(genome_name) + + def process_contigset(self, assembly, contigset_name, alexey_id): + hash_contigset = assembly.hash_value + if hash_contigset not in self.cdm_contigsets: + self.cdm_contigsets[hash_contigset] = { + 'n_contigs': len(assembly.features), + 'length': ETLAlexeyGenome._a_sz(assembly), + 'alexey_db_id': [alexey_id] + } + else: + self.cdm_contigsets[hash_contigset]['alexey_db_id'].append(alexey_id) + + self.cdm_name_contigset.add((hash_contigset, 'contigset', contigset_name, 'alexey_db')) + + def process_contig_no_gff(self, assembly, contig_alias_to_alexey_id): + """ + process contigs without gff + + The alexey dict maps the alternative alias to the surrogate key from the alexey database + :param gff_records: + :param assembly: + :param contig_alias_to_alexey_id: + :return: + """ + scaffold_alias_to_contig_id = {} + + hash_contigset = assembly.hash_value + + # iterate all records look for DNA type feature and extract Alias Name and the original feature_id + # from the Note attr + for feature_contig in assembly.features: + contig_id = feature_contig.id + cdm_contig = CDMContig(hash_contigset, feature_contig.seq) + pk = (cdm_contig.hash_contig, cdm_contig.contig_set_id) + + if pk not in self.data_contigs: + self.data_contigs[pk] = cdm_contig + self.data_contigs[pk].alexey_id = set() + self.data_contigs[pk].names = set() + + self.data_contigs[pk].alexey_id.add(contig_alias_to_alexey_id[contig_id]) + self.data_contigs[pk].names.add(( + cdm_contig.hash_contig, cdm_contig.contig_set_id, + 'contig', contig_id, 'enigma', 'from _contig.fasta')) + + @staticmethod + def get_feature_contig(gff_record, assembly): + contig_id = gff_record.contig_id + if contig_id in assembly.features: + return assembly.features.get_by_id(contig_id) + else: + contig_id = gff_record.attr['Note'].split(',')[0][:-1] + if contig_id in assembly.features: + return assembly.features.get_by_id(contig_id) + else: + contig_id = gff_record.attr['Note'].split(" ")[0] + return assembly.features.get_by_id(contig_id) + + def process_contigs(self, gff_records, assembly, contig_alias_to_alexey_id): + """ + process contigs we need both the assembly object and the gff records + the gff records contain the alternative aliases given by KBase Prokka while + the assembly object has the contig sequence and the original feature_id + + The alexey dict maps the alternative alias to the surrogate key from the alexey database + :param gff_records: + :param assembly: + :param contig_alias_to_alexey_id: + :return: + """ + scaffold_alias_to_contig_id = {} + + hash_contigset = assembly.hash_value + + # iterate all records look for DNA type feature and extract Alias Name and the original feature_id + # from the Note attr + for gff_record in gff_records: + if gff_record.feature_type == 'DNA': + + alias = gff_record.attr.get('Alias') + gff_contig_id = gff_record.contig_id + alexey_db_id = contig_alias_to_alexey_id.get(alias) + if alexey_db_id is None: + alexey_db_id = contig_alias_to_alexey_id.get(gff_contig_id) + + if alias not in scaffold_alias_to_contig_id: + feature_contig = ETLAlexeyGenome.get_feature_contig(gff_record, assembly) + + cdm_contig = CDMContig(hash_contigset, feature_contig.seq) + pk = (cdm_contig.hash_contig, cdm_contig.contig_set_id) + + if pk not in self.data_contigs: + self.data_contigs[pk] = cdm_contig + self.data_contigs[pk].alexey_id = set() + self.data_contigs[pk].names = set() + + self.data_contigs[pk].alexey_id.add(alexey_db_id) + + self.data_contigs[pk].names.add(( + cdm_contig.hash_contig, cdm_contig.contig_set_id, + 'contig', alias, 'enigma', 'from _Prodigal.gff' + )) + self.data_contigs[pk].names.add(( + cdm_contig.hash_contig, cdm_contig.contig_set_id, + 'contig', feature_contig.id, 'enigma', 'from _contigs.fasta' + )) + if gff_record.attr['Name'] != alias: + self.data_contigs[pk].names.add(( + cdm_contig.hash_contig, cdm_contig.contig_set_id, + 'contig', + gff_record.attr['Name'], + 'enigma', + 'from _contigs.fasta' + )) + else: + raise ValueError('!') + + @staticmethod + def build_feature_name(genome_id, gff_record): + """ + if 'Name' in gff_record.attr: + attr_name = gff_record.attr.get('Name') + return f"{genome_id}_{gff_record.feature_type}_{attr_name}" + elif 'ID' in gff_record.attr: + attr_id = gff_record.attr.get('ID') + return f"{genome_id}_{gff_record.feature_type}_{attr_id}" + else: + """ + return f"{genome_id}_{gff_record.contig_id}_{gff_record.feature_type}_{gff_record.start}_{gff_record.end}_{gff_record.strand}" + + @staticmethod + def find_feature(genome, gff_record): + if 'Name' in gff_record.attr: + _name = gff_record.attr.get('Name') + if _name in genome.features: + return genome.features.get_by_id(_name) + else: + _id = gff_record.attr.get('ID') + if _id in genome.features: + return genome.features.get_by_id(_id) + else: + return genome.features.get_by_id(_id.split('.')[0]) + + _id = gff_record.attr.get('ID') + if _id in genome.features: + return genome.features.get_by_id(_id) + + search_id = '_'.join(gff_record.contig_id.split('_')[:-1]) + '_' + gff_record.attr.get('ID') + if search_id in genome.features: + return genome.features.get_by_id(search_id) + + search_id = gff_record.contig_id[:-1] + gff_record.attr.get('ID') + return genome.features.get_by_id(search_id) + + def process_features(self, genome_id, gff_records, assembly, genome, protocol_id): + from modelseedpy_ext.re.hash_seq import HashSeq + feature_function = {} # product + + def _add_name(cdm_feature, name): + if name: + self.cdm_name_feature.add((cdm_feature.id, 'feature', name, 'enigma', 'from _Prodigal.gff')) + + hash_contigset = assembly.hash_value + assembly_to_gff = MapAssemblyToGff(assembly, gff_records) + + for i in range(len(gff_records)): + gff_record = gff_records[i] + + feature_contig = assembly_to_gff.gff_index_to_contig[i] + hash_contig = assembly_to_gff.contig_to_hash_seq[feature_contig.id] + + feature_id = ETLAlexeyGenome.build_feature_name(genome_id, gff_record) + cdm_feature = CDMFeature(feature_id, hash_contig, hash_contigset, + gff_record.start, gff_record.end, gff_record.strand, + gff_record.feature_type, gff_record.source, gff_record.phase, + protocol_id, + gff_record.attr) + + _add_name(cdm_feature, gff_record.attr.get('Name')) + _add_name(cdm_feature, gff_record.attr.get('gene_synonym')) + annotation = gff_record.attr.get('product') + if annotation: + self.cdm_feature_annotation[cdm_feature.id] = annotation + + if cdm_feature.id not in self.data_features: + self.data_features[cdm_feature.id] = cdm_feature + else: + raise ValueError(f'id error {cdm_feature.id}') + + if gff_record.feature_type == 'CDS': + feature_protein = ETLAlexeyGenome.find_feature(genome, gff_record) + _add_name(cdm_feature, feature_protein.id) + + cdm_protein = CDMProtein(feature_protein.seq) + # map encoded protein + self.data_feature_to_protein[(cdm_feature.id, cdm_protein.hash)] = cdm_protein.stop_codon + + # print(feature_id, name_to_contig_hash[gff_record.contig_id]) + # print(gff_record.attr) + + def export_data_contigs(self): + names = ["hash_contig", "hash_contigset", "length", "gc_content", "base_count"] + _data = {k: [] for k in names} + + for o in tqdm(self.data_contigs.values()): + _data["hash_contigset"].append(o.contig_set_id) + _data["hash_contig"].append(o.hash_contig) + _data["length"].append(len(o.seq)) + _data["gc_content"].append(o.gc) + _data["base_count"].append(o.base_count) + + table = pa.Table.from_arrays([pa.array(_data[k]) for k in names], names=names) + return table + + def export_data_name_contigs(self): + _table_name_names = ["id_entity", "hash_contigset", "name_table", "name", "source", "description"] + _table_name_data = {k: [] for k in _table_name_names} + names = {} + for o in tqdm(self.data_contigs.values()): + for hash_contig, hash_contigset, name_table, name, source, description in o.names: + names[(hash_contig, name, source)] = ( + hash_contig, hash_contigset, name_table, name, source, description) + for hash_contig, hash_contigset, name_table, name, source, description in names.values(): + _table_name_data['id_entity'].append(hash_contig) + _table_name_data['hash_contigset'].append(hash_contigset) + _table_name_data['name_table'].append(name_table) + _table_name_data['name'].append(name) + _table_name_data['source'].append(source) + _table_name_data['description'].append(description) + table = pa.Table.from_arrays([pa.array(_table_name_data[k]) for k in _table_name_names], + names=_table_name_names) + return table + + +class EtlAssemblyAndGenome: + + def __init__(self, fn_get_feature_contig=None, fn_get_start_end_strand_s_attr=None, fn_get_feature_id=None): + + self.data_contig_set = [] + self.data_contigs = [] + + self.contig_name_to_hash = {} + + self.data_proteins = {} + self.data_features = {} + self.data_feature_to_protein = {} + + self.names = set() + + self._fn_get_feature_contig = fn_get_feature_contig + self._fn_get_start_end_strand_s_attr = fn_get_start_end_strand_s_attr + if self._fn_get_feature_contig is None: + self._fn_get_feature_contig = _get_feature_contig + if self._fn_get_start_end_strand_s_attr is None: + self._fn_get_start_end_strand_s_attr = _get_start_end_strand_s_attr + self._fn_get_feature_id = _get_feature_id + if fn_get_feature_id: + self._fn_get_feature_id = fn_get_feature_id + + def etl_assembly(self, g_assembly, contig_id_name_source='ncbi'): + contig_set_hash = g_assembly.hash_value + + cdm_contig_set = CDMContigSet(contig_set_hash) + + for feature in g_assembly.features: + cdm_contig = CDMContig(contig_set_hash, feature.seq) + if contig_id_name_source: + cdm_contig.names.append((contig_id_name_source, feature.id)) + cdm_contig_set.contigs.append(cdm_contig) + self.data_contigs.append(cdm_contig) + + for _t, _n in cdm_contig.names: + self.names.add((cdm_contig.hash_contig, _t, _n)) + if _n not in self.contig_name_to_hash: + self.contig_name_to_hash[_n] = cdm_contig.hash_contig + elif self.contig_name_to_hash[_n] != cdm_contig.hash_contig: + raise ValueError("dup") + + self.data_contig_set.append(cdm_contig_set) + + return contig_set_hash + + def features(self, genome: MSGenome, gff): + for f in genome.features: + yield f + + def etl_genome(self, genome: MSGenome, hash_contigset, gff=None): + + for f in self.features(): + feature_id = self._fn_get_feature_id(f) + _contig_name = self._fn_get_feature_contig(f) + _contig_name_hash = self.contig_name_to_hash[_contig_name] + + start, end, strand, attr = self._fn_get_start_end_strand_s_attr(f, gff) + + cdm_feature = CDMFeature( + feature_id, + _contig_name_hash, + hash_contigset, + start, + end, + strand, + attr, + ) + cdm_protein = CDMProtein(f.seq) + self.data_feature_to_protein[cdm_feature.id] = ( + cdm_protein.hash, + cdm_protein.stop_codon, + ) + if cdm_feature.id not in self.data_features: + self.data_features[cdm_feature.id] = cdm_feature + else: + raise ValueError("dup feature id !~!") + if cdm_protein.hash not in self.data_proteins: + self.data_proteins[cdm_protein.hash] = cdm_protein + + def export_protein(self): + names = ["hash_protein_sequence", "description", "length", "sequence", "evidence_for_existence"] + _data = {k: [] for k in names} + for i in tqdm(self.data_proteins): + o = self.data_proteins[i] + _data["hash_protein_sequence"].append(o.hash) + _data["length"].append(len(o.seq)) + _data["sequence"].append(o.seq) + _data["description"].append(None) + _data["evidence_for_existence"].append(None) + + pa_table = pa.Table.from_arrays( + [pa.array(_data[k]) for k in names], names=names + ) + return pa_table + + def export_names(self): + _label_names = ["entity_id", "type", "name"] + _data_names = {k: [] for k in _label_names} + for _hash, _type, _value in tqdm(self.names): + _data_names["entity_id"].append(_hash) + _data_names["type"].append(_type) + _data_names["name"].append(_value) + + table = pa.Table.from_arrays( + [pa.array(_data_names[k]) for k in _label_names], names=_label_names + ) + return table + + def export_data_contigset(self): + names = ["hash_contigset", "n_contigs"] + _data = {k: [] for k in names} + for o in tqdm(self.data_contig_set): + _data["hash_contigset"].append(o.hash_contigset) + _data["n_contigs"].append(len(o.contigs)) + + table = pa.Table.from_arrays([pa.array(_data[k]) for k in names], names=names) + return table + + def export_data_contigs(self): + names = ["hash_contigset", "hash_contig", "length", "gc_content", "base_count"] + _data = {k: [] for k in names} + + _label_names = ["entity_id", "type", "name"] + _data_names = {k: [] for k in _label_names} + for o in tqdm(self.data_contigs): + _data["hash_contigset"].append(o.contig_set_id) + _data["hash_contig"].append(o.hash_contig) + _data["length"].append(len(o.seq)) + _data["gc_content"].append(o.gc) + _data["base_count"].append(o.base_count) + + table = pa.Table.from_arrays([pa.array(_data[k]) for k in names], names=names) + return table + + def export_data_feature(self): + names = [ + "feature_id", + "hash_contig", + "hash_contigset", + "hash", + "source", + "type", + "start", + "end", + "strand", + "cds_phase", + "attributes", + ] + _data = {k: [] for k in names} + for i in tqdm(self.data_features): + o = self.data_features[i] + _data["feature_id"].append(o.id) + _data["hash_contig"].append(o.hash_contig) + _data["hash_contigset"].append(o.hash_contigset) + _data["hash"].append(None) + _data["source"].append(o.source) + _data["type"].append("CDS") + _data["start"].append(o.start) + _data["end"].append(o.end) + _data["strand"].append(o.strand) + _data["cds_phase"].append(None) + _data["attributes"].append(o.attributes) + + table = pa.Table.from_arrays([pa.array(_data[k]) for k in names], names=names) + return table + + def export_data_feature_x_protein(self): + names = ["feature_id", "hash_protein_sequence", "has_stop_codon"] + _data = {k: [] for k in names} + for feature_id in tqdm(self.data_feature_to_protein): + protein_hash, has_stop_codon = self.data_feature_to_protein[feature_id] + _data["feature_id"].append(feature_id) + _data["hash_protein_sequence"].append(protein_hash) + _data["has_stop_codon"].append(has_stop_codon) + + table = pa.Table.from_arrays([pa.array(_data[k]) for k in names], names=names) + return table + + +class TransformToParquet: + + def __init__(self): + self.data_contigsets = {} + self.data_contigs = {} + self.data_contigset_x_contig = set() + self.data_features = {} + self.data_proteins = {} + self.data_feature_x_protein = set() + + def transform(self, list_etl: list): + + for etl in list_etl: + cdm_contigset = etl._cdm_contigset + if cdm_contigset.hash_contigset not in self.data_contigsets: + self.data_contigsets[cdm_contigset.hash_contigset] = cdm_contigset + else: + print('!') + for h in etl._data_cdm_contig: + cdm_contig = etl._data_cdm_contig[h] + if cdm_contig.hash_contig not in self.data_contigs: + self.data_contigs[cdm_contig.hash_contig] = cdm_contig + else: + print('!') + for t in etl._data_cdm_contigset_x_contig: + if t not in self.data_contigset_x_contig: + self.data_contigset_x_contig.add(t) + else: + print('!') + + for f_id in etl._data_cdm_feature: + cdm_feature = etl._data_cdm_feature[f_id] + if f_id not in self.data_features: + self.data_features[f_id] = cdm_feature + else: + raise ValueError('feature') + + for p_id in etl._data_cdm_protein: + cdm_protein = etl._data_cdm_protein[p_id] + if p_id not in self.data_features: + self.data_proteins[p_id] = cdm_protein + else: + raise ValueError('protein') + + for t in etl._data_cdm_feature_x_protein: + if t not in self.data_feature_x_protein: + self.data_feature_x_protein.add(t) + + @staticmethod + def export_contigs(list_contig: list): + names = ["hash_contig", "length", "gc_content", "base_count"] + data = {k: [] for k in names} + + for o in tqdm(list_contig): + data["hash_contig"].append(o.hash_contig) + data["length"].append(o.length) + data["gc_content"].append(o.gc) + data["base_count"].append(o.base_count) + + table = pa.Table.from_arrays( + [pa.array(data[k]) for k in names], + names=names) + + return table + + @staticmethod + def export_contigsets(list_contigset: list): + names = ["hash_contigset", "n_contigs", "size"] + data = {k: [] for k in names} + + for o in tqdm(list_contigset): + data["hash_contigset"].append(o.hash_contigset) + data["n_contigs"].append(len(o.contigs)) + data["size"].append(o.size) + + table = pa.Table.from_arrays( + [pa.array(data[k]) for k in names], + names=names) + + return table + + @staticmethod + def export_contigset_x_contigs(list_contigset: list): + names = ["hash_contigset_x_contig", "hash_contigset", "hash_contig", "contig_index"] + data = {k: [] for k in names} + + for hash_contigset_x_contig, hash_contigset, hash_contig, contig_index in tqdm(list_contigset): + data["hash_contigset_x_contig"].append(hash_contigset_x_contig) + data["hash_contigset"].append(hash_contigset) + data["hash_contig"].append(hash_contig) + data["contig_index"].append(contig_index) + + table = pa.Table.from_arrays( + [pa.array(data[k]) for k in names], + names=names) + + return table + + def export_features(self, list_feature): + names = ["feature_id", "hash_contigset_x_contig", + "source", "protocol_id", "type", + "start", "end", "strand", "cds_phase"] + data = {k: [] for k in names} + + for o in tqdm(list_feature): + data["feature_id"].append(o.id) + data["hash_contigset_x_contig"].append(o.contigset_x_contig_id) + data["source"].append(o.source) + data["protocol_id"].append(o.protocol) + data["type"].append(o.type) + data["start"].append(o.start) + data["end"].append(o.end) + data["strand"].append(o.strand) + data["cds_phase"].append(o.cds_phase) + + table = pa.Table.from_arrays( + [pa.array(data[k]) for k in names], + names=names) + + return table + + def export_proteins(self, list_protein): + names = ["hash_protein_sequence", "description", + "length", "sequence"] + data = {k: [] for k in names} + + for o in tqdm(list_protein): + data["hash_protein_sequence"].append(o.hash) + data["description"].append(None) + data["length"].append(len(o.seq)) + data["sequence"].append(o.seq) + + table = pa.Table.from_arrays( + [pa.array(data[k]) for k in names], + names=names) + + return table + + def export_feature_x_proteins(self, list_feature_to_proteins): + names = ["feature_id", "hash_protein_sequence", "has_stop_codon"] + data = {k: [] for k in names} + + for feature_id, hash_protein in tqdm(list_feature_to_proteins): + data["feature_id"].append(feature_id) + data["hash_protein_sequence"].append(hash_protein) + data["has_stop_codon"].append(True) + + table = pa.Table.from_arrays( + [pa.array(data[k]) for k in names], + names=names) + + return table \ No newline at end of file diff --git a/cdm_data_loader_utils/io/__init__.py b/cdm_data_loader_utils/io/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/cdm_data_loader_utils/io/genome.py b/cdm_data_loader_utils/io/genome.py new file mode 100755 index 0000000..b3a8dfe --- /dev/null +++ b/cdm_data_loader_utils/io/genome.py @@ -0,0 +1,56 @@ +from Bio import SeqIO +from cdm_data_loader_utils.core.genome import CDMContig + + +def read_gbff_records_from_file(filename: str): + if filename.endswith(".gbff"): + with open(filename, 'r') as fh: + return read_gbff_records(fh) + elif filename.endswith(".gz"): + import gzip + from io import StringIO + with gzip.open(filename, 'rb') as fh: + return read_gbff_records(StringIO(fh.read().decode('utf-8'))) + + +def read_gbff_records(handler): + gbff_records = [] + for record in SeqIO.parse(handler, "gb"): + gbff_records.append(record) + return gbff_records + + +class ContigFeaturePairsBuilder: + + @staticmethod + def get_feature_contig_id(f): + ids = {x[0] for x in f.location} + if len(ids) == 1: + return list(ids)[0] + else: + raise ValueError('unable to fetch contig id') + + @staticmethod + def from_kbase(genome_assembly, genome_features): + contig_to_features = {} + for contig in genome_assembly.features: + contig_to_features[contig.id] = [] + for feature in genome_features.features: + contig_id = ContigFeaturePairsBuilder.get_feature_contig_id(feature) + contig_to_features[contig_id].append(feature) + contig_feature_pairs = [] + for contig_id, features in contig_to_features.items(): + contig = genome_assembly.features.get_by_id(contig_id) + cdm_contig = CDMContig(str(contig.seq)) + contig_feature_pairs.append((cdm_contig, features)) + + return contig_feature_pairs + + @staticmethod + def from_gbff(filename): + gbff_records = read_gbff_records_from_file(filename) + contig_feature_pairs = [] + for rec in gbff_records: + cdm_contig = CDMContig(str(rec.seq)) + contig_feature_pairs.append((cdm_contig, [f for f in rec.features])) + return contig_feature_pairs diff --git a/pyproject.toml b/pyproject.toml new file mode 100755 index 0000000..8e0e52d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,27 @@ +[build-system] +requires = [ + 'setuptools>=40.6.0', + 'wheel' +] +build-backend = "setuptools.build_meta" + +[tool.black] +line-length = 88 +python-version = ['py38'] +include = '\.pyi?$' +exclude = ''' +( + /( + \.eggs # exclude a few common directories in the + | \.git # root of the project + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + )/ +) +''' diff --git a/requirements.txt b/requirements.txt index f0918d2..0b617cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,10 @@ +tqdm biopython +modelseedpy==0.4.2 +sqlalchemy +pyarrow pandas +polars pytest coverage pytest-coverage diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..aa272e3 --- /dev/null +++ b/setup.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- + +from setuptools import setup, find_packages + +with open("README.md") as f: + _readme = f.read() + +with open("LICENSE") as f: + _license = f.read() + +setup( + name="CDM-Data-Loader-Utils", + version="0.0.1", + description="CDM-Data-Loader-Utils", + long_description_content_type="text/x-rst", + long_description=_readme, + author="KBase developers", + author_email="KBase developers", + url="https://github.com/kbase/cdm-data-loader-utils", + license=_license, + packages=find_packages(exclude=("docs")), + package_data={ + + }, + classifiers=[ + "Development Status :: 3 - Alpha", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Intended Audience :: Science/Research", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Natural Language :: English", + ], + install_requires=[ + "pandas >= 2.2.2", + ], + tests_require=[ + "pytest", + ], + project_urls={ + "Documentation": "", + "Issues": "", + }, +) diff --git a/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.fna.gz b/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.fna.gz new file mode 100755 index 0000000..e21b3e7 Binary files /dev/null and b/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.fna.gz differ diff --git a/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gbff.gz b/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gbff.gz new file mode 100755 index 0000000..833ee39 Binary files /dev/null and b/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gbff.gz differ diff --git a/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gff.gz b/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gff.gz new file mode 100755 index 0000000..7d1c491 Binary files /dev/null and b/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gff.gz differ diff --git a/tests/data/genome/GCF_000005845.2_ASM584v2_protein.faa.gz b/tests/data/genome/GCF_000005845.2_ASM584v2_protein.faa.gz new file mode 100755 index 0000000..3a50efd Binary files /dev/null and b/tests/data/genome/GCF_000005845.2_ASM584v2_protein.faa.gz differ diff --git a/tests/data/genome_paths_file/valid.json b/tests/data/genome_paths_file/valid.json old mode 100644 new mode 100755 diff --git a/tests/src/parsers/test_etl_genome.py b/tests/src/parsers/test_etl_genome.py new file mode 100755 index 0000000..0e3150f --- /dev/null +++ b/tests/src/parsers/test_etl_genome.py @@ -0,0 +1,13 @@ +from cdm_data_loader_utils.io.genome import ContigFeaturePairsBuilder +from cdm_data_loader_utils.etl_genome import ETLGbffGenome + + +def test_etl_genome_gbff(): + pairs = ContigFeaturePairsBuilder.from_gbff('tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gbff.gz') + etl = ETLGbffGenome() + etl.etl('my_genome', 'some_protocol', pairs) + + assert etl._cdm_contigset is not None + assert len(etl._data_cdm_contig) == 1 + assert len(etl._data_cdm_contigset_x_contig) == 1 + assert len(etl._data_cdm_protein) == 4257 diff --git a/tests/src/parsers/test_io_genome.py b/tests/src/parsers/test_io_genome.py new file mode 100755 index 0000000..698a063 --- /dev/null +++ b/tests/src/parsers/test_io_genome.py @@ -0,0 +1,7 @@ +from cdm_data_loader_utils.io.genome import read_gbff_records_from_file + + +def test_read_gbff_records_from_file(): + records = read_gbff_records_from_file('tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gbff.gz') + + assert len(records) == 1