diff --git a/.gitignore b/.gitignore
index 82f9275..21ea91e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,9 @@ __pycache__/
 *.py[cod]
 *$py.class
 
+# IDE
+.idea
+
 # C extensions
 *.so
 
@@ -160,3 +163,7 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+#
+
+# syn nas
+@eaDir/
diff --git a/cdm_data_loader_utils/__init__.py b/cdm_data_loader_utils/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/cdm_data_loader_utils/biodb/__init__.py b/cdm_data_loader_utils/biodb/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/cdm_data_loader_utils/biodb/ncbi.py b/cdm_data_loader_utils/biodb/ncbi.py
new file mode 100755
index 0000000..e69de29
diff --git a/cdm_data_loader_utils/core/__init__.py b/cdm_data_loader_utils/core/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/cdm_data_loader_utils/core/genome.py b/cdm_data_loader_utils/core/genome.py
new file mode 100755
index 0000000..891c63a
--- /dev/null
+++ b/cdm_data_loader_utils/core/genome.py
@@ -0,0 +1,205 @@
+from modelseedpy.core.msgenome import MSGenome, read_fasta2
+from cdm_data_loader_utils.core.hash_seq import HashSeq, HashSeqList
+from collections import Counter
+
+
+class CDMProtocol:
+
+    def __init__(self, protocol_id, parent_id, name, version, identifier, sim_t, inputs, outputs):
+        self.id = protocol_id
+        self.parent_id = parent_id
+        self.name = name
+        self.version = version
+        self.identifier = identifier
+        self.sim_t = sim_t
+        self.inputs = inputs
+        self.outputs = outputs
+
+
+class CDMContigSet:
+    def __init__(self, hash_contigset):
+        self.hash_contigset = hash_contigset
+        self.contigs = []
+
+    @property
+    def size(self):
+        size = 0
+        for contig in self.contigs:
+            size += len(contig.seq)
+        return size
+
+    @staticmethod
+    def from_contigs(contigs: list):
+        h_list_contigs = HashSeqList()
+        for v in contigs:
+            h_list_contigs.append(HashSeq(str(v.seq)))
+        cdm_contigset = CDMContigSet(h_list_contigs.hash_value)
+        cdm_contigset.contigs.extend(contigs)
+        return cdm_contigset
+
+
+class CDMContig:
+    def __init__(self, seq: str):
+        self.seq = seq
+        self.hash_contig = HashSeq(self.seq).hash_value
+        self.base_count = dict(Counter(list(self.seq.upper())))
+        self.length = len(self.seq)
+        self.gc = (
+            self.base_count.get("G", 0) + self.base_count.get("C", 0)
+        ) / self.length
+
+        self.names = []
+
+    def __repr__(self):
+        return f"len: {self.length}, gc: {self.gc}, base_count: {self.base_count}, names: {self.names}"
+
+
+class CDMProtein:
+    def __init__(self, seq: str):
+        self.stop_codon = False
+        _seq = seq
+        if _seq[-1] == "*":
+            _seq = _seq[:-1]
+            self.stop_codon = True
+
+        self.seq = _seq
+        self.hash = HashSeq(self.seq).hash_value
+        self.length = len(self.seq)
+
+        self.names = []
+
+    def __repr__(self):
+        return f"len: {self.length}, hash: {self.hash}"
+
+
+class CDMFeature:
+    def __init__(
+        self, feature_id: str, contigset_x_contig_id: str, start: int, end: int, strand: str,
+        feature_type: str, source=None, phase=None, protocol=None, attributes=None
+    ):
+        self.id = feature_id
+        self.contigset_x_contig_id = contigset_x_contig_id
+        self.start = start
+        self.end = end
+        self.strand = strand
+        self.type = feature_type
+        self.source = source
+        self.cds_phase = phase
+        self.protocol = protocol
+        self.attributes = {} if attributes is None else attributes
+
+        self.names = []
+
+
+class GffRecord:
+    def __init__(
+        self,
+        contig_id: str,
+        source: str,
+        feature_type,
+        start: int,
+        end: int,
+        score,
+        strand,
+        phase,
+        attr,
+    ):
+        self.contig_id = contig_id
+        self.source = source
+        self.feature_type = feature_type
+        self.start = start
+        self.end = end
+        self.score = score
+        self.strand = strand
+        self.phase = phase
+        self.attr = attr
+
+    def get_attribute_string(self):
+        attr_values = []
+        for k, v in self.attr.items():
+            attr_values.append(f"{k}={v}")
+        return ";".join(attr_values)
+
+    def __str__(self):
+        return "\t".join(
+            [
+                str(x)
+                for x in [
+                    self.contig_id,
+                    self.source,
+                    self.feature_type,
+                    self.start,
+                    self.end,
+                    self.score,
+                    self.strand,
+                    self.phase,
+                    self.get_attribute_string(),
+                ]
+            ]
+        )
+
+    @staticmethod
+    def from_str(s):
+        (
+            contig_id,
+            source,
+            feature_type,
+            start,
+            end,
+            score,
+            strand,
+            phase,
+            attr_str,
+        ) = s.strip().split("\t")
+        attr = dict([x.split("=") for x in attr_str.split(";")])
+        return GffRecord(
+            contig_id,
+            source,
+            feature_type,
+            int(start),
+            int(end),
+            score,
+            strand,
+            phase,
+            attr,
+        )
+
+
+class REAssembly(MSGenome):
+    def __init__(self):
+        super().__init__()
+        self.hash_list = HashSeqList()
+        for contig in self.features:
+            seq = HashSeq(contig.seq)
+            self.hash_list.append(seq)
+
+    def re(self):
+        pass
+
+    def ke(self):
+        pass
+
+    @staticmethod
+    def from_fasta(filename, split=" ", h_func=None):
+        genome = REAssembly()
+        genome.features += read_fasta2(filename, split, h_func)
+        return genome
+
+    @property
+    def hash_value(self):
+        hl = HashSeqList()
+        for contig in self.features:
+            seq = HashSeq(contig.seq)
+            hl.append(seq)
+        return hl.hash_value
+
+    @staticmethod
+    def _process_contigs(contigs):
+        hash_list = HashSeqList()
+        contig_h_d = []
+        for contig in contigs.features:
+            seq = HashSeq(contig.seq)
+            hash_list.append(seq)
+            seq_h = seq.hash_value
+            contig_h_d.append([seq_h, contig.id, contig.description])
+        return {"genome_h": hash_list.hash_value, "contig_h": contig_h_d}
diff --git a/cdm_data_loader_utils/core/hash_seq.py b/cdm_data_loader_utils/core/hash_seq.py
new file mode 100755
index 0000000..dd84a9b
--- /dev/null
+++ b/cdm_data_loader_utils/core/hash_seq.py
@@ -0,0 +1,33 @@
+import hashlib
+
+
+def _hash_string(s):
+    return hashlib.sha256(s.encode("utf-8")).hexdigest()
+
+
+class HashSeq(str):
+    def __new__(cls, v):
+        # print('validate!!', v)
+        instance = super().__new__(cls, v.upper())
+        return instance
+
+    @property
+    def hash_value(self):
+        h = _hash_string(self)
+        return h
+
+
+class HashSeqList(list):
+    def append(self, o, /):
+        if type(o) is str:
+            super().append(HashSeq(o))
+        elif type(o) is HashSeq:
+            super().append(o)
+        else:
+            raise ValueError("bad type")
+
+    @property
+    def hash_value(self):
+        h_list = [x.hash_value for x in self]
+        hash_seq = "_".join(sorted(h_list))
+        return _hash_string(hash_seq)
diff --git a/cdm_data_loader_utils/etl/load_mysql.py b/cdm_data_loader_utils/etl/load_mysql.py
new file mode 100755
index 0000000..bb76d71
--- /dev/null
+++ b/cdm_data_loader_utils/etl/load_mysql.py
@@ -0,0 +1,223 @@
+import logging
+import json
+import polars as pl
+from sqlalchemy import text
+from sqlalchemy.orm import Session
+
+logger = logging.getLogger(__name__)
+
+
+LOADER_CONFIGURATION = {
+    'contigset': [
+        ['hash_contigset'],
+        {
+            'hash_contigset': (None, str),
+            'n_contigs': (None, int),
+            'size': (None, int),
+        }
+    ],
+    'contig': [
+        ['hash_contig'],
+        {
+            'hash_contig': (None, str),
+            'length': (None, int),
+            'gc_content': (None, float),
+            'base_count': (None, dict)
+        }
+    ],
+    'contigset_x_contig': [
+        ['hash_contigset_x_contig'],
+        {
+            'hash_contigset_x_contig': (None, str),
+            'hash_contigset': (None, str),
+            'hash_contig': (None, str),
+            'contig_index': (None, int)
+        }
+    ],
+    'protocol': [
+        ['protocol_id'],
+        {
+            'protocol_id': (None, str),
+            'parent_protocol_id': (None, str),
+            'name': (None, str),
+            'version': (None, str),
+            'identifier': (None, str),
+            'inputs': (None, str),
+            'outputs': (None, str),
+        }
+    ],
+    'feature': [
+        ['feature_id'],
+        {
+            'feature_id': (None, str),
+            'hash_contigset_x_contig': (None, str),
+            'source': (None, str),
+            'protocol_id': (None, str),
+            'type': (None, str),
+            'start': (None, int),
+            'end': (None, int),
+            'strand': (None, str),
+            'cds_phase': (None, int),
+        }
+    ],
+    'protein': [
+        ['hash_protein_sequence'],
+        {
+            'hash_protein_sequence': (None, str),
+            'length': (None, int),
+            'sequence': (None, str)
+        }
+    ],
+    'feature_x_protein': [
+        ['feature_id', 'hash_protein_sequence'],
+        {
+            'feature_id': (None, str),
+            'hash_protein_sequence': (None, str),
+            'has_stop_codon': (None, bool)
+        }
+    ]
+}
+
+
+class LoadSQL:
+
+    def __init__(self, engine, table_name: str, columns_pk: list, columns_config: dict):
+        self.engine = engine
+        self.batch_size = 100000
+        self.table_name = table_name
+        self.columns_pk = columns_pk
+        self.columns_config = columns_config
+
+    @staticmethod
+    def build_from_config(engine, table_name, config):
+        columns_pk = config[0]
+        columns_config = config[1]
+        return LoadSQL(engine, table_name, columns_pk, columns_config)
+
+    def load(self, filename, pos=0):
+        fn_fetch_keys = self.fetch_keys(self.table_name, self.columns_pk)
+        self._load(filename, self.table_name, fn_fetch_keys, self.columns_pk, self.columns_config, pos=pos)
+
+    def fetch_keys(self, table_name, pk):
+        def fn_fetch_keys():
+            table_ids = set()
+            with self.engine.connect() as con:
+                select_columns = ', '.join(pk)
+                query = f"SELECT {select_columns} FROM {table_name};"
+                rs = con.execute(text(query))
+                for o in rs:
+                    pk_tuple = tuple([o[i] for i in range(len(pk))])
+                    table_ids.add(pk_tuple)
+            return table_ids
+
+        return fn_fetch_keys
+
+    def load_contigset(self, filename, pos=0):
+        table_name = 'contigset'
+        pk = ['hash_contigset']
+        columns = {
+            'hash_contigset': (None, str),
+            'n_contigs': (None, int),
+            'size': (None, int),
+        }
+        fn_fetch_keys = self.fetch_keys(table_name, pk)
+        self._load(filename, table_name, fn_fetch_keys, pk, columns, pos)
+
+    def load_contig(self, filename, pos=0):
+        table_name = 'contig'
+        pk = ['hash_contig']
+        columns = {
+            'hash_contig': (None, str),
+            'length': (None, int),
+            'gc_content': (None, float),
+            'base_count': (None, dict)
+        }
+        fn_fetch_keys = self.fetch_keys(table_name, pk)
+        self._load(filename, table_name, fn_fetch_keys, pk, columns, pos)
+
+    def load_contigset_x_contig(self, filename, pos=0):
+        table_name = 'contigset_x_contig'
+        pk = ['hash_contigset_x_contig']
+        columns = {
+            'hash_contigset_x_contig': (None, str),
+            'hash_contigset': (None, str),
+            'hash_contig': (None, str),
+            'contig_index': (None, int)
+        }
+        fn_fetch_keys = self.fetch_keys(table_name, pk)
+        self._load(filename, table_name, fn_fetch_keys, pk, columns, pos)
+
+    @staticmethod
+    def process_type(v, t):
+        if v is None:
+            return 'NULL'
+        elif t is str:
+            return f"'{v}'"
+        elif t is int:
+            return int(v)
+        elif t is float:
+            return float(v)
+        elif t is bool:
+            if v:
+                return 'TRUE'
+            else:
+                return 'FALSE'
+        elif t is dict:
+            json_str = json.dumps({k: v for k, v in v.items() if v is not None})
+            return f"'{json_str}'"
+        raise ValueError(f'invalid type {t}')
+
+    def process_rows(self, table_ids: set, rows: list, columns_pk: list, columns: dict):
+
+        insert_columns = list(columns)
+
+        sql_values = []
+        for row in rows:
+            logger.debug(row)
+
+            pk = tuple([row[x] for x in columns_pk])
+
+            logger.debug(pk)
+
+            if pk not in table_ids:
+                table_ids.add(pk)
+
+                sql_row = []
+                for column_name in insert_columns:
+                    df_id, column_type = columns[column_name]
+                    v = row[column_name] if df_id is None else row[df_id]
+                    sql_row.append(str(self.process_type(v, column_type)))
+
+                sql_str = ', '.join(sql_row)
+
+                logger.debug(sql_str)
+                sql_values.append(f'({sql_str})')
+
+        return insert_columns, sql_values
+
+    def _load(self, filename, table_name, fn_fetch_keys, columns_pk, columns, pos=0):
+        lazy_frame = pl.scan_parquet(filename)
+        df_size = lazy_frame.select(pl.len()).collect()['len'][0]
+        table_ids = fn_fetch_keys()
+
+        print(df_size, len(table_ids))
+
+        while pos < df_size:
+            with Session(self.engine) as session:
+                rows = lazy_frame.slice(pos, self.batch_size).collect(streaming=False).rows(named=True)
+                insert_columns, sql_values = self.process_rows(table_ids, rows, columns_pk, columns)
+
+                sql_columns = ', '.join(insert_columns)
+                if len(sql_values) > 0:
+                    sql_insert = f"""
+                    INSERT INTO {table_name} 
+                      ({sql_columns}) 
+                    VALUES 
+                    """
+                    sql_insert += ',\n'.join(sql_values)
+                    sql_insert += ';'
+                    print('payload len:', len(sql_insert))
+                    session.execute(text(sql_insert))
+                    session.commit()
+            pos += self.batch_size
+            print(pos)
diff --git a/cdm_data_loader_utils/etl_genome.py b/cdm_data_loader_utils/etl_genome.py
new file mode 100755
index 0000000..e0b7a9d
--- /dev/null
+++ b/cdm_data_loader_utils/etl_genome.py
@@ -0,0 +1,1054 @@
+import pyarrow as pa
+import pandas as pd
+import os
+from modelseedpy import MSGenome
+from cdm_data_loader_utils.core.genome import (
+    CDMContig,
+    CDMContigSet,
+    CDMFeature,
+    CDMProtein,
+)
+from tqdm import tqdm
+
+
+def _get_feature_contig_kbase(feature):
+    contigs = {x[0] for x in feature.location}
+    if len(contigs) == 1:
+        return list(contigs)[0]
+    raise ValueError(f'Feature: {feature.id} - invalid contig location {contigs}')
+
+
+def _get_feature_contig(feature):
+    _id = feature.id
+    _contig_name = _id[::-1].split("_", 1)[1][::-1]  # reverse split once reverse again
+    return _contig_name
+
+
+def _get_start_end_strand_s_attr(feature):
+    start, end, strand_s, attr = feature.description.split(" # ")
+    if start.startswith("# "):
+        start = int(start[1:].strip())
+    end = int(end)
+    strand = "?"
+    if strand_s == "1":
+        strand = "+"
+    elif strand_s == "-1":
+        strand = "-"
+    else:
+        raise ValueError(f"bad strand: {strand_s}")
+
+    if attr:
+        attr = dict(x.split("=") for x in attr.split(";"))
+    else:
+        attr = {}
+
+    return start, end, strand, attr
+
+
+def _get_start_end_strand_s_attr_kbase(feature):
+    for location in feature.location:
+        contig, p1, strand, p2 = location
+        start = p1
+        end = p2
+        if strand == '+':
+            start = p1
+            end = p1 + p2 - 1
+        elif strand == '-':
+            start = p1 - p2 + 1
+            end = p1
+        return start, end, strand, {}
+
+    return
+
+
+def _get_feature_id(feature):
+    return feature.id
+
+
+class GbffAssemblyAndGenome:
+
+    def __init__(self, gbff_records: list):
+        if gbff_records is None or len(gbff_records) == 0:
+            raise ValueError('empty gbff features')
+
+        self.gbff_records = gbff_records
+
+    @property
+    def contigs(self):
+        for contig in self.gbff_records:
+            yield contig
+
+    @staticmethod
+    def from_file(filename):
+        from cdm_data_loader_utils.io.genome import read_gbff_records_from_file
+        return GbffAssemblyAndGenome(read_gbff_records_from_file(filename))
+
+
+class ETLGbffGenome:
+
+    def __init__(self):
+        self._data_cdm_contig = {}
+        self._data_cdm_contigset_x_contig = set()
+        self._cdm_contigset = None
+        self._data_cdm_protein = {}
+        self._data_cdm_feature = {}
+        self._data_cdm_feature_x_protein = set()
+        self._data_feature_qualifiers = {}
+        self.feature_source = 'gbff'
+        self.qualifier_translation = 'translation'
+        self.feature_id_max_len = 62
+        self.fn_feature_metadata = None
+
+    def etl(self, genome_id, protocol_id, contig_feature_pairs):
+        self.etl_assembly([x[0] for x in contig_feature_pairs])
+        self.etl_features(genome_id, protocol_id, contig_feature_pairs)
+
+    def etl_assembly(self, contigs):
+        l_contig = []
+        h_to_gbff_record = {}
+        for contig in contigs:
+            cdm_contig = contig if type(contig) == CDMContig else CDMContig(str(contig.seq))
+            if cdm_contig.hash_contig not in self._data_cdm_contig:
+                self._data_cdm_contig[cdm_contig.hash_contig] = cdm_contig
+            if cdm_contig.hash_contig not in h_to_gbff_record:
+                h_to_gbff_record[cdm_contig.hash_contig] = []
+            h_to_gbff_record[cdm_contig.hash_contig].append(contig)
+            l_contig.append(cdm_contig)
+        self._cdm_contigset = CDMContigSet.from_contigs(l_contig)
+
+        return h_to_gbff_record
+
+    def register_contigset_x_contig(self, cdm_contigset, cdm_contig, index_contig):
+        contigset_x_contig_id = ETLGbffGenome.get_contigset_x_contig_id(cdm_contigset,
+                                                                        cdm_contig,
+                                                                        index_contig)
+
+        self._data_cdm_contigset_x_contig.add((
+            contigset_x_contig_id,
+            cdm_contigset.hash_contigset,
+            cdm_contig.hash_contig,
+            index_contig
+        ))
+
+        return contigset_x_contig_id
+
+    def etl_features(self, genome_id, protocol_id, contig_feature_pairs):
+        h_to_contig_to_features = {}
+        for cdm_contig, features in contig_feature_pairs:
+            if cdm_contig.hash_contig not in h_to_contig_to_features:
+                h_to_contig_to_features[cdm_contig.hash_contig] = []
+            h_to_contig_to_features[cdm_contig.hash_contig].append((cdm_contig, features))
+
+        # h_to_gbff_record = self.etl_assembly(contigs)
+
+        repeat_counter = {}
+        for hash_contig in h_to_contig_to_features:
+            cdm_contig = self._data_cdm_contig[hash_contig]
+            index_contig = 0
+            contigset_x_contig_id = self.register_contigset_x_contig(self._cdm_contigset, cdm_contig, index_contig)
+
+            for _cdm_contig, features in h_to_contig_to_features[hash_contig]:
+                for gbff_feature in features:
+                    feature_id = ETLGbffGenome._get_feature_id(genome_id, gbff_feature)
+                    if feature_id in self._data_cdm_feature:
+                        counter = repeat_counter.get(feature_id, 1)
+                        repeat_counter[feature_id] = counter + 1
+                        feature_id = f'{feature_id}_{counter}'
+                    cdm_feature = ETLGbffGenome.convert_to_cdm_feature(feature_id,
+                                                                       contigset_x_contig_id,
+                                                                       gbff_feature,
+                                                                       gbff_feature.type)
+                    # set protocol
+                    cdm_feature.protocol = protocol_id
+                    # set source
+                    cdm_feature.source = self.feature_source
+                    self._data_feature_qualifiers[cdm_feature.id] = self.get_other_qualifiers(gbff_feature)
+
+                    if len(cdm_feature.id) < self.feature_id_max_len and cdm_feature.id not in self._data_cdm_feature:
+                        self._data_cdm_feature[cdm_feature.id] = cdm_feature
+                    else:
+                        if len(cdm_feature.id) >= self.feature_id_max_len:
+                            raise ValueError(f'x feature id {cdm_feature.id}')
+                        else:
+                            raise ValueError(f'duplicate feature id {cdm_feature.id}')
+                    if gbff_feature.type == 'CDS':
+                        cdm_protein = self.get_protein(gbff_feature)
+                        if cdm_protein and cdm_protein.hash not in self._data_cdm_protein:
+                            self._data_cdm_protein[cdm_protein.hash] = cdm_protein
+                            self._data_cdm_feature_x_protein.add((cdm_feature.id, cdm_protein.hash))
+                index_contig += 1
+
+    @staticmethod
+    def _get_feature_id(genome_id, f):
+        start = int(f.location.start)
+        end = int(f.location.end)
+        feature_id = f'{genome_id}_{f.type}_{start}_{end}'
+        if 'locus_tag' in f.qualifiers and len(f.qualifiers['locus_tag']) == 1:
+            locus = f.qualifiers['locus_tag'][0]
+            return f'{genome_id}_{locus}_{f.type}_{start}_{end}'
+        return feature_id
+
+    @staticmethod
+    def get_contigset_x_contig_id(cdm_contigset, cdm_contig, index_contig):
+        contigset_x_contig_str = f'{cdm_contigset.hash_contigset}_{cdm_contig.hash_contig}_{index_contig}'
+        return ETLGbffGenome._hash_string(contigset_x_contig_str)
+
+
+    def get_other_qualifiers(self, f):
+        qualifiers = {}
+        for k in f.qualifiers:
+            if k not in {self.qualifier_translation}:
+                qualifiers[k] = f.qualifiers[k]
+        return qualifiers
+
+    def get_protein(self, f):
+        translation = f.qualifiers.get(self.qualifier_translation)
+        if translation is None:
+            return None
+        if type(translation) == list and len(translation) == 1:
+            return CDMProtein(translation[0])
+        raise ValueError(f'unable to get translation: {f.qualifiers}')
+
+    @staticmethod
+    def get_strand(f_strand):
+        if f_strand == 1:
+            return '+'
+        elif f_strand == -1:
+            return '-'
+        raise ValueError('strand error')
+
+    @staticmethod
+    def convert_to_cdm_feature(feature_id, contigset_x_contig_id, f, feature_type):
+        start = int(f.location.start)
+        end = int(f.location.end)
+        strand = ETLGbffGenome.get_strand(f.location.strand)
+        cdm_feature = CDMFeature(feature_id, contigset_x_contig_id, start, end, strand, feature_type)
+        return cdm_feature
+
+    @staticmethod
+    def _hash_string(s):
+        import hashlib
+        return hashlib.sha256(s.encode("utf-8")).hexdigest()
+
+
+class ETLAssemblyAndGenome(ETLGbffGenome):
+
+    def __init__(self):
+        super().__init__()
+        self.feature_source = 'kbase'
+
+    @staticmethod
+    def convert_to_cdm_feature(feature_id, contigset_x_contig_id, f, feature_type):
+        start, end, strand, attr = _get_start_end_strand_s_attr_kbase(f)
+        cdm_feature = CDMFeature(feature_id, contigset_x_contig_id, start, end, strand, feature_type)
+        return cdm_feature
+
+    def register_contigset_x_contig(self, cdm_contigset, cdm_contig, index_contig):
+        contigset_x_contig_id = ETLGbffGenome.get_contigset_x_contig_id(cdm_contigset,
+                                                                        cdm_contig,
+                                                                        index_contig)
+
+        self._data_cdm_contigset_x_contig.add((
+            contigset_x_contig_id,
+            cdm_contigset.hash_contigset,
+            cdm_contig.hash_contig,
+            index_contig
+        ))
+
+        return contigset_x_contig_id
+
+    def get_protein(self, f):
+        seq = f.seq
+        if seq:
+            return CDMProtein(seq)
+        else:
+            return None
+
+    def etl_features(self, genome_id, protocol_id, contig_feature_pairs):
+        h_to_contig_to_features = {}
+        for cdm_contig, features in contig_feature_pairs:
+            if cdm_contig.hash_contig not in h_to_contig_to_features:
+                h_to_contig_to_features[cdm_contig.hash_contig] = []
+            h_to_contig_to_features[cdm_contig.hash_contig].append((cdm_contig, features))
+
+        for hash_contig in h_to_contig_to_features:
+            index_contig = 0
+            cdm_contig = self._data_cdm_contig[hash_contig]
+
+            contigset_x_contig_id = self.register_contigset_x_contig(self._cdm_contigset, cdm_contig, index_contig)
+
+            for _cdm_contig, features in h_to_contig_to_features[hash_contig]:
+                for feature in features:
+                    feature_id = f'{genome_id}_{feature.id}'
+                    cdm_feature = ETLAssemblyAndGenome.convert_to_cdm_feature(feature_id,
+                                                                              contigset_x_contig_id,
+                                                                              feature,
+                                                                              'CDS')
+                    # set protocol
+                    cdm_feature.protocol = protocol_id
+                    # set source
+                    cdm_feature.source = self.feature_source
+
+                    if self.fn_feature_metadata:
+                        self._data_feature_qualifiers[cdm_feature.id] = self.fn_feature_metadata(feature)
+
+                    if len(cdm_feature.id) < self.feature_id_max_len and cdm_feature.id not in self._data_cdm_feature:
+                        self._data_cdm_feature[cdm_feature.id] = cdm_feature
+                    else:
+                        if len(cdm_feature.id) >= self.feature_id_max_len:
+                            raise ValueError(f'x feature id {cdm_feature.id}')
+                        else:
+                            raise ValueError(f'duplicate feature id {cdm_feature.id}')
+
+                    cdm_protein = self.get_protein(feature)
+                    if cdm_protein and cdm_protein.hash not in self._data_cdm_protein:
+                        self._data_cdm_protein[cdm_protein.hash] = cdm_protein
+                        self._data_cdm_feature_x_protein.add((cdm_feature.id, cdm_protein.hash))
+            index_contig += 1
+
+
+class MapAssemblyToGff:
+
+    def __init__(self, assembly, gff_records):
+        from modelseedpy_ext.re.hash_seq import HashSeq
+        self.contig_id_to_feature_contig = {}
+        self.gff_index_to_contig = {}
+        self.contig_to_hash_seq = {}
+
+        from collections import Counter
+        counter = Counter()
+        for gff_record in gff_records:
+            counter.update([gff_record.feature_type])
+
+        for gff_record in gff_records:
+            if gff_record.contig_id not in self.contig_id_to_feature_contig:
+                self.contig_id_to_feature_contig[gff_record.contig_id] = None
+
+        if 'DNA' in counter:
+            for gff_record in gff_records:
+                if gff_record.feature_type == 'DNA':
+                    feature_contig = MapAssemblyToGff.get_feature_contig(gff_record, assembly)
+                    if gff_record.contig_id in self.contig_id_to_feature_contig:
+                        self.contig_id_to_feature_contig[gff_record.contig_id] = feature_contig
+                    if 'Alias' in gff_record.attr and gff_record.attr.get('Alias') in self.contig_id_to_feature_contig:
+                        alias = gff_record.attr.get('Alias')
+                        if self.contig_id_to_feature_contig[alias] is None:
+                            self.contig_id_to_feature_contig[alias] = feature_contig
+                        elif self.contig_id_to_feature_contig[alias] != feature_contig:
+                            raise ValueError('!')
+        else:
+            for f in assembly.features:
+                if f.id in self.contig_id_to_feature_contig:
+                    self.contig_id_to_feature_contig[f.id] = f
+
+        for k in self.contig_id_to_feature_contig:
+            feature_contig = self.contig_id_to_feature_contig[k]
+            if feature_contig is None:
+                raise ValueError(f'Unmapped contig: {k}')
+            if feature_contig.id not in self.contig_to_hash_seq:
+                hash_contig = HashSeq(feature_contig.seq).hash_value
+                self.contig_to_hash_seq[feature_contig.id] = hash_contig
+
+        for i in range(len(gff_records)):
+            gff_record = gff_records[i]
+            feature_contig = self.contig_id_to_feature_contig[gff_record.contig_id]
+            if feature_contig is None:
+                raise ValueError(f'Unable to find contig: {gff_record}')
+
+            self.gff_index_to_contig[i] = feature_contig
+
+    @staticmethod
+    def get_feature_contig(gff_record, assembly):
+        contig_id = gff_record.contig_id
+        if contig_id in assembly.features:
+            return assembly.features.get_by_id(contig_id)
+        else:
+            contig_id = gff_record.attr['Note'].split(',')[0][:-1]
+            if contig_id in assembly.features:
+                return assembly.features.get_by_id(contig_id)
+            else:
+                contig_id = gff_record.attr['Note'].split(" ")[0]
+                return assembly.features.get_by_id(contig_id)
+
+
+class ETLAlexeyGenome:
+    _S = len('/scratch/shared/CDM/alexey_genomes_ranjan/genomes_final/genome_processing/')
+    _STRIP_LEN = -1 * len('_contigs.fasta')
+
+    def __init__(self, sql_engine, exclude=None):
+        self.cdm_contigsets = {}
+        self.cdm_name_contigset = set()
+        self.cdm_name_contig = set()
+        self.genome_scan = {}
+        self.data_contigs = {}
+        self.data_features = {}
+        self.data_feature_to_protein = {}
+        self.cdm_name_feature = set()
+        self.cdm_feature_annotation = {}
+        self.sql_engine = sql_engine
+        self.exclude = exclude
+        if self.exclude is None:
+            self.exclude = set()
+
+    @staticmethod
+    def swap_dict(d):
+        res = {}
+        for k, v in d.items():
+            if v not in res:
+                res[v] = k
+            else:
+                raise ValueError('not one to one')
+        return res
+
+    @staticmethod
+    def _a_sz(a):
+        _sz = 0
+        for f in a.features:
+            _sz += len(f.seq)
+        return _sz
+
+    def scan_genomes(self, path_contigs):
+        for path_contig in tqdm(path_contigs):
+            self.scan_genome(path_contig)
+
+        df_alexey_genome = pd.read_sql("SELECT * FROM browser_genome;", self.sql_engine)
+
+        alexey_unique_genome_names = set(df_alexey_genome['name'])
+        missing = alexey_unique_genome_names - set(self.genome_scan)
+        not_in_alexey = set(self.genome_scan) - alexey_unique_genome_names
+        both = set(self.genome_scan) & alexey_unique_genome_names
+        print(len(alexey_unique_genome_names), len(missing), len(not_in_alexey), len(both))
+
+    def scan_genome(self, path_contig):
+        _prefix_filename = path_contig[:ETLAlexeyGenome._STRIP_LEN]
+        path_faa = _prefix_filename + '_Prodigal_proteins.faa'
+        path_gff = _prefix_filename + '_Prodigal.gff'
+
+        if os.path.exists(path_faa) and os.path.exists(path_gff):
+            _p = path_contig[ETLAlexeyGenome._S:].split('/')
+            base_name = _p[0]
+            base_name_v = _p[2]
+            if base_name_v not in self.genome_scan:
+                self.genome_scan[base_name_v] = {
+                    'path_contigs.fasta': path_contig,
+                    'path_faa': path_faa,
+                    'path_gff': path_gff
+                }
+            else:
+                print('!0', path_contig)
+
+    def process_genomes(self):
+        df_alexey_genome = pd.read_sql("SELECT * FROM browser_genome;", self.sql_engine)
+
+        alexey_unique_genome_names = set(df_alexey_genome['name'])
+        missing = alexey_unique_genome_names - set(self.genome_scan)
+        not_in_alexey = set(self.genome_scan) - alexey_unique_genome_names
+        both = set(self.genome_scan) & alexey_unique_genome_names
+        print(len(alexey_unique_genome_names), len(missing), len(not_in_alexey), len(both))
+
+        for row_id, d in tqdm(df_alexey_genome.iterrows()):
+            genome_name = d['name']
+            if genome_name in both and genome_name not in self.exclude:
+                _scan_data = self.genome_scan[genome_name]
+                assembly = REAssembly.from_fasta(_scan_data['path_contigs.fasta'])
+                assembly_size = ETLAlexeyGenome._a_sz(assembly)
+                contigs = len(assembly.features)
+                contigs_ids = [x.id for x in assembly.features]
+
+                match = assembly_size == d['size'] and contigs == d['contigs']
+                if match:
+                    id_genome_alexey = d['id']
+                    self.process_contigset(assembly, genome_name, id_genome_alexey)
+
+                    df_contigs = pd.read_sql(
+                        f"SELECT * FROM browser_contig WHERE genome_id = {id_genome_alexey};",
+                        self.sql_engine)
+                    d_id_contig_alexey_to_contig_name = ETLAlexeyGenome.swap_dict(
+                        df_contigs.set_index('id')['contig_id'].to_dict())
+                    #genome = MSGenome.from_fasta(_scan_data['path_faa'])
+                    gff_records = _read_gff_features(_scan_data['path_gff'])
+
+                    from collections import Counter
+                    counter = Counter()
+                    for gff_record in gff_records:
+                        counter.update([gff_record.feature_type])
+                    gff_feature_types = dict(counter)
+
+                    try:
+                        if 'DNA' in gff_feature_types:
+                            self.process_contigs(gff_records, assembly, d_id_contig_alexey_to_contig_name)
+                        else:
+                            self.process_contig_no_gff(assembly, d_id_contig_alexey_to_contig_name)
+                    except Exception as ex:
+                        print(f'I failed to parse this: {genome_name}')
+                        raise ValueError(ex)
+                else:
+                    print(genome_name)
+
+    def process_contigset(self, assembly, contigset_name, alexey_id):
+        hash_contigset = assembly.hash_value
+        if hash_contigset not in self.cdm_contigsets:
+            self.cdm_contigsets[hash_contigset] = {
+                'n_contigs': len(assembly.features),
+                'length': ETLAlexeyGenome._a_sz(assembly),
+                'alexey_db_id': [alexey_id]
+            }
+        else:
+            self.cdm_contigsets[hash_contigset]['alexey_db_id'].append(alexey_id)
+
+        self.cdm_name_contigset.add((hash_contigset, 'contigset', contigset_name, 'alexey_db'))
+
+    def process_contig_no_gff(self, assembly, contig_alias_to_alexey_id):
+        """
+        process contigs without gff
+
+        The alexey dict maps the alternative alias to the surrogate key from the alexey database
+        :param gff_records:
+        :param assembly:
+        :param contig_alias_to_alexey_id:
+        :return:
+        """
+        scaffold_alias_to_contig_id = {}
+
+        hash_contigset = assembly.hash_value
+
+        # iterate all records look for DNA type feature and extract Alias Name and the original feature_id
+        #  from the Note attr
+        for feature_contig in assembly.features:
+            contig_id = feature_contig.id
+            cdm_contig = CDMContig(hash_contigset, feature_contig.seq)
+            pk = (cdm_contig.hash_contig, cdm_contig.contig_set_id)
+
+            if pk not in self.data_contigs:
+                self.data_contigs[pk] = cdm_contig
+                self.data_contigs[pk].alexey_id = set()
+                self.data_contigs[pk].names = set()
+
+            self.data_contigs[pk].alexey_id.add(contig_alias_to_alexey_id[contig_id])
+            self.data_contigs[pk].names.add((
+                cdm_contig.hash_contig, cdm_contig.contig_set_id,
+                'contig', contig_id, 'enigma', 'from _contig.fasta'))
+
+    @staticmethod
+    def get_feature_contig(gff_record, assembly):
+        contig_id = gff_record.contig_id
+        if contig_id in assembly.features:
+            return assembly.features.get_by_id(contig_id)
+        else:
+            contig_id = gff_record.attr['Note'].split(',')[0][:-1]
+            if contig_id in assembly.features:
+                return assembly.features.get_by_id(contig_id)
+            else:
+                contig_id = gff_record.attr['Note'].split(" ")[0]
+                return assembly.features.get_by_id(contig_id)
+
+    def process_contigs(self, gff_records, assembly, contig_alias_to_alexey_id):
+        """
+        process contigs we need both the assembly object and the gff records
+        the gff records contain the alternative aliases given by KBase Prokka while
+        the assembly object has the contig sequence and the original feature_id
+
+        The alexey dict maps the alternative alias to the surrogate key from the alexey database
+        :param gff_records:
+        :param assembly:
+        :param contig_alias_to_alexey_id:
+        :return:
+        """
+        scaffold_alias_to_contig_id = {}
+
+        hash_contigset = assembly.hash_value
+
+        # iterate all records look for DNA type feature and extract Alias Name and the original feature_id
+        #  from the Note attr
+        for gff_record in gff_records:
+            if gff_record.feature_type == 'DNA':
+
+                alias = gff_record.attr.get('Alias')
+                gff_contig_id = gff_record.contig_id
+                alexey_db_id = contig_alias_to_alexey_id.get(alias)
+                if alexey_db_id is None:
+                    alexey_db_id = contig_alias_to_alexey_id.get(gff_contig_id)
+
+                if alias not in scaffold_alias_to_contig_id:
+                    feature_contig = ETLAlexeyGenome.get_feature_contig(gff_record, assembly)
+
+                    cdm_contig = CDMContig(hash_contigset, feature_contig.seq)
+                    pk = (cdm_contig.hash_contig, cdm_contig.contig_set_id)
+
+                    if pk not in self.data_contigs:
+                        self.data_contigs[pk] = cdm_contig
+                        self.data_contigs[pk].alexey_id = set()
+                        self.data_contigs[pk].names = set()
+
+                    self.data_contigs[pk].alexey_id.add(alexey_db_id)
+
+                    self.data_contigs[pk].names.add((
+                        cdm_contig.hash_contig, cdm_contig.contig_set_id,
+                        'contig', alias, 'enigma', 'from _Prodigal.gff'
+                    ))
+                    self.data_contigs[pk].names.add((
+                        cdm_contig.hash_contig, cdm_contig.contig_set_id,
+                        'contig', feature_contig.id, 'enigma', 'from _contigs.fasta'
+                    ))
+                    if gff_record.attr['Name'] != alias:
+                        self.data_contigs[pk].names.add((
+                            cdm_contig.hash_contig, cdm_contig.contig_set_id,
+                            'contig',
+                            gff_record.attr['Name'],
+                            'enigma',
+                            'from _contigs.fasta'
+                        ))
+                else:
+                    raise ValueError('!')
+
+    @staticmethod
+    def build_feature_name(genome_id, gff_record):
+        """
+        if 'Name' in gff_record.attr:
+            attr_name = gff_record.attr.get('Name')
+            return f"{genome_id}_{gff_record.feature_type}_{attr_name}"
+        elif 'ID' in gff_record.attr:
+            attr_id = gff_record.attr.get('ID')
+            return f"{genome_id}_{gff_record.feature_type}_{attr_id}"
+        else:
+        """
+        return f"{genome_id}_{gff_record.contig_id}_{gff_record.feature_type}_{gff_record.start}_{gff_record.end}_{gff_record.strand}"
+
+    @staticmethod
+    def find_feature(genome, gff_record):
+        if 'Name' in gff_record.attr:
+            _name = gff_record.attr.get('Name')
+            if _name in genome.features:
+                return genome.features.get_by_id(_name)
+            else:
+                _id = gff_record.attr.get('ID')
+                if _id in genome.features:
+                    return genome.features.get_by_id(_id)
+                else:
+                    return genome.features.get_by_id(_id.split('.')[0])
+
+        _id = gff_record.attr.get('ID')
+        if _id in genome.features:
+            return genome.features.get_by_id(_id)
+
+        search_id = '_'.join(gff_record.contig_id.split('_')[:-1]) + '_' + gff_record.attr.get('ID')
+        if search_id in genome.features:
+            return genome.features.get_by_id(search_id)
+
+        search_id = gff_record.contig_id[:-1] + gff_record.attr.get('ID')
+        return genome.features.get_by_id(search_id)
+
+    def process_features(self, genome_id, gff_records, assembly, genome, protocol_id):
+        from modelseedpy_ext.re.hash_seq import HashSeq
+        feature_function = {}  # product
+
+        def _add_name(cdm_feature, name):
+            if name:
+                self.cdm_name_feature.add((cdm_feature.id, 'feature', name, 'enigma', 'from _Prodigal.gff'))
+
+        hash_contigset = assembly.hash_value
+        assembly_to_gff = MapAssemblyToGff(assembly, gff_records)
+
+        for i in range(len(gff_records)):
+            gff_record = gff_records[i]
+
+            feature_contig = assembly_to_gff.gff_index_to_contig[i]
+            hash_contig = assembly_to_gff.contig_to_hash_seq[feature_contig.id]
+
+            feature_id = ETLAlexeyGenome.build_feature_name(genome_id, gff_record)
+            cdm_feature = CDMFeature(feature_id, hash_contig, hash_contigset,
+                                     gff_record.start, gff_record.end, gff_record.strand,
+                                     gff_record.feature_type, gff_record.source, gff_record.phase,
+                                     protocol_id,
+                                     gff_record.attr)
+
+            _add_name(cdm_feature, gff_record.attr.get('Name'))
+            _add_name(cdm_feature, gff_record.attr.get('gene_synonym'))
+            annotation = gff_record.attr.get('product')
+            if annotation:
+                self.cdm_feature_annotation[cdm_feature.id] = annotation
+
+            if cdm_feature.id not in self.data_features:
+                self.data_features[cdm_feature.id] = cdm_feature
+            else:
+                raise ValueError(f'id error {cdm_feature.id}')
+
+            if gff_record.feature_type == 'CDS':
+                feature_protein = ETLAlexeyGenome.find_feature(genome, gff_record)
+                _add_name(cdm_feature, feature_protein.id)
+
+                cdm_protein = CDMProtein(feature_protein.seq)
+                # map encoded protein
+                self.data_feature_to_protein[(cdm_feature.id, cdm_protein.hash)] = cdm_protein.stop_codon
+
+            # print(feature_id, name_to_contig_hash[gff_record.contig_id])
+            # print(gff_record.attr)
+
+    def export_data_contigs(self):
+        names = ["hash_contig", "hash_contigset", "length", "gc_content", "base_count"]
+        _data = {k: [] for k in names}
+
+        for o in tqdm(self.data_contigs.values()):
+            _data["hash_contigset"].append(o.contig_set_id)
+            _data["hash_contig"].append(o.hash_contig)
+            _data["length"].append(len(o.seq))
+            _data["gc_content"].append(o.gc)
+            _data["base_count"].append(o.base_count)
+
+        table = pa.Table.from_arrays([pa.array(_data[k]) for k in names], names=names)
+        return table
+
+    def export_data_name_contigs(self):
+        _table_name_names = ["id_entity", "hash_contigset", "name_table", "name", "source", "description"]
+        _table_name_data = {k: [] for k in _table_name_names}
+        names = {}
+        for o in tqdm(self.data_contigs.values()):
+            for hash_contig, hash_contigset, name_table, name, source, description in o.names:
+                names[(hash_contig, name, source)] = (
+                    hash_contig, hash_contigset, name_table, name, source, description)
+        for hash_contig, hash_contigset, name_table, name, source, description in names.values():
+            _table_name_data['id_entity'].append(hash_contig)
+            _table_name_data['hash_contigset'].append(hash_contigset)
+            _table_name_data['name_table'].append(name_table)
+            _table_name_data['name'].append(name)
+            _table_name_data['source'].append(source)
+            _table_name_data['description'].append(description)
+        table = pa.Table.from_arrays([pa.array(_table_name_data[k]) for k in _table_name_names],
+                                     names=_table_name_names)
+        return table
+
+
+class EtlAssemblyAndGenome:
+
+    def __init__(self, fn_get_feature_contig=None, fn_get_start_end_strand_s_attr=None, fn_get_feature_id=None):
+
+        self.data_contig_set = []
+        self.data_contigs = []
+
+        self.contig_name_to_hash = {}
+
+        self.data_proteins = {}
+        self.data_features = {}
+        self.data_feature_to_protein = {}
+
+        self.names = set()
+
+        self._fn_get_feature_contig = fn_get_feature_contig
+        self._fn_get_start_end_strand_s_attr = fn_get_start_end_strand_s_attr
+        if self._fn_get_feature_contig is None:
+            self._fn_get_feature_contig = _get_feature_contig
+        if self._fn_get_start_end_strand_s_attr is None:
+            self._fn_get_start_end_strand_s_attr = _get_start_end_strand_s_attr
+        self._fn_get_feature_id = _get_feature_id
+        if fn_get_feature_id:
+            self._fn_get_feature_id = fn_get_feature_id
+
+    def etl_assembly(self, g_assembly, contig_id_name_source='ncbi'):
+        contig_set_hash = g_assembly.hash_value
+
+        cdm_contig_set = CDMContigSet(contig_set_hash)
+
+        for feature in g_assembly.features:
+            cdm_contig = CDMContig(contig_set_hash, feature.seq)
+            if contig_id_name_source:
+                cdm_contig.names.append((contig_id_name_source, feature.id))
+            cdm_contig_set.contigs.append(cdm_contig)
+            self.data_contigs.append(cdm_contig)
+
+            for _t, _n in cdm_contig.names:
+                self.names.add((cdm_contig.hash_contig, _t, _n))
+                if _n not in self.contig_name_to_hash:
+                    self.contig_name_to_hash[_n] = cdm_contig.hash_contig
+                elif self.contig_name_to_hash[_n] != cdm_contig.hash_contig:
+                    raise ValueError("dup")
+
+        self.data_contig_set.append(cdm_contig_set)
+
+        return contig_set_hash
+
+    def features(self, genome: MSGenome, gff):
+        for f in genome.features:
+            yield f
+
+    def etl_genome(self, genome: MSGenome, hash_contigset, gff=None):
+
+        for f in self.features():
+            feature_id = self._fn_get_feature_id(f)
+            _contig_name = self._fn_get_feature_contig(f)
+            _contig_name_hash = self.contig_name_to_hash[_contig_name]
+
+            start, end, strand, attr = self._fn_get_start_end_strand_s_attr(f, gff)
+
+            cdm_feature = CDMFeature(
+                feature_id,
+                _contig_name_hash,
+                hash_contigset,
+                start,
+                end,
+                strand,
+                attr,
+            )
+            cdm_protein = CDMProtein(f.seq)
+            self.data_feature_to_protein[cdm_feature.id] = (
+                cdm_protein.hash,
+                cdm_protein.stop_codon,
+            )
+            if cdm_feature.id not in self.data_features:
+                self.data_features[cdm_feature.id] = cdm_feature
+            else:
+                raise ValueError("dup feature id !~!")
+            if cdm_protein.hash not in self.data_proteins:
+                self.data_proteins[cdm_protein.hash] = cdm_protein
+
+    def export_protein(self):
+        names = ["hash_protein_sequence", "description", "length", "sequence", "evidence_for_existence"]
+        _data = {k: [] for k in names}
+        for i in tqdm(self.data_proteins):
+            o = self.data_proteins[i]
+            _data["hash_protein_sequence"].append(o.hash)
+            _data["length"].append(len(o.seq))
+            _data["sequence"].append(o.seq)
+            _data["description"].append(None)
+            _data["evidence_for_existence"].append(None)
+
+        pa_table = pa.Table.from_arrays(
+            [pa.array(_data[k]) for k in names], names=names
+        )
+        return pa_table
+
+    def export_names(self):
+        _label_names = ["entity_id", "type", "name"]
+        _data_names = {k: [] for k in _label_names}
+        for _hash, _type, _value in tqdm(self.names):
+            _data_names["entity_id"].append(_hash)
+            _data_names["type"].append(_type)
+            _data_names["name"].append(_value)
+
+        table = pa.Table.from_arrays(
+            [pa.array(_data_names[k]) for k in _label_names], names=_label_names
+        )
+        return table
+
+    def export_data_contigset(self):
+        names = ["hash_contigset", "n_contigs"]
+        _data = {k: [] for k in names}
+        for o in tqdm(self.data_contig_set):
+            _data["hash_contigset"].append(o.hash_contigset)
+            _data["n_contigs"].append(len(o.contigs))
+
+        table = pa.Table.from_arrays([pa.array(_data[k]) for k in names], names=names)
+        return table
+
+    def export_data_contigs(self):
+        names = ["hash_contigset", "hash_contig", "length", "gc_content", "base_count"]
+        _data = {k: [] for k in names}
+
+        _label_names = ["entity_id", "type", "name"]
+        _data_names = {k: [] for k in _label_names}
+        for o in tqdm(self.data_contigs):
+            _data["hash_contigset"].append(o.contig_set_id)
+            _data["hash_contig"].append(o.hash_contig)
+            _data["length"].append(len(o.seq))
+            _data["gc_content"].append(o.gc)
+            _data["base_count"].append(o.base_count)
+
+        table = pa.Table.from_arrays([pa.array(_data[k]) for k in names], names=names)
+        return table
+
+    def export_data_feature(self):
+        names = [
+            "feature_id",
+            "hash_contig",
+            "hash_contigset",
+            "hash",
+            "source",
+            "type",
+            "start",
+            "end",
+            "strand",
+            "cds_phase",
+            "attributes",
+        ]
+        _data = {k: [] for k in names}
+        for i in tqdm(self.data_features):
+            o = self.data_features[i]
+            _data["feature_id"].append(o.id)
+            _data["hash_contig"].append(o.hash_contig)
+            _data["hash_contigset"].append(o.hash_contigset)
+            _data["hash"].append(None)
+            _data["source"].append(o.source)
+            _data["type"].append("CDS")
+            _data["start"].append(o.start)
+            _data["end"].append(o.end)
+            _data["strand"].append(o.strand)
+            _data["cds_phase"].append(None)
+            _data["attributes"].append(o.attributes)
+
+        table = pa.Table.from_arrays([pa.array(_data[k]) for k in names], names=names)
+        return table
+
+    def export_data_feature_x_protein(self):
+        names = ["feature_id", "hash_protein_sequence", "has_stop_codon"]
+        _data = {k: [] for k in names}
+        for feature_id in tqdm(self.data_feature_to_protein):
+            protein_hash, has_stop_codon = self.data_feature_to_protein[feature_id]
+            _data["feature_id"].append(feature_id)
+            _data["hash_protein_sequence"].append(protein_hash)
+            _data["has_stop_codon"].append(has_stop_codon)
+
+        table = pa.Table.from_arrays([pa.array(_data[k]) for k in names], names=names)
+        return table
+
+
+class TransformToParquet:
+
+    def __init__(self):
+        self.data_contigsets = {}
+        self.data_contigs = {}
+        self.data_contigset_x_contig = set()
+        self.data_features = {}
+        self.data_proteins = {}
+        self.data_feature_x_protein = set()
+
+    def transform(self, list_etl: list):
+
+        for etl in list_etl:
+            cdm_contigset = etl._cdm_contigset
+            if cdm_contigset.hash_contigset not in self.data_contigsets:
+                self.data_contigsets[cdm_contigset.hash_contigset] = cdm_contigset
+            else:
+                print('!')
+            for h in etl._data_cdm_contig:
+                cdm_contig = etl._data_cdm_contig[h]
+                if cdm_contig.hash_contig not in self.data_contigs:
+                    self.data_contigs[cdm_contig.hash_contig] = cdm_contig
+                else:
+                    print('!')
+            for t in etl._data_cdm_contigset_x_contig:
+                if t not in self.data_contigset_x_contig:
+                    self.data_contigset_x_contig.add(t)
+                else:
+                    print('!')
+
+            for f_id in etl._data_cdm_feature:
+                cdm_feature = etl._data_cdm_feature[f_id]
+                if f_id not in self.data_features:
+                    self.data_features[f_id] = cdm_feature
+                else:
+                    raise ValueError('feature')
+
+            for p_id in etl._data_cdm_protein:
+                cdm_protein = etl._data_cdm_protein[p_id]
+                if p_id not in self.data_features:
+                    self.data_proteins[p_id] = cdm_protein
+                else:
+                    raise ValueError('protein')
+
+            for t in etl._data_cdm_feature_x_protein:
+                if t not in self.data_feature_x_protein:
+                    self.data_feature_x_protein.add(t)
+
+    @staticmethod
+    def export_contigs(list_contig: list):
+        names = ["hash_contig", "length", "gc_content", "base_count"]
+        data = {k: [] for k in names}
+
+        for o in tqdm(list_contig):
+            data["hash_contig"].append(o.hash_contig)
+            data["length"].append(o.length)
+            data["gc_content"].append(o.gc)
+            data["base_count"].append(o.base_count)
+
+        table = pa.Table.from_arrays(
+            [pa.array(data[k]) for k in names],
+            names=names)
+
+        return table
+
+    @staticmethod
+    def export_contigsets(list_contigset: list):
+        names = ["hash_contigset", "n_contigs", "size"]
+        data = {k: [] for k in names}
+
+        for o in tqdm(list_contigset):
+            data["hash_contigset"].append(o.hash_contigset)
+            data["n_contigs"].append(len(o.contigs))
+            data["size"].append(o.size)
+
+        table = pa.Table.from_arrays(
+            [pa.array(data[k]) for k in names],
+            names=names)
+
+        return table
+
+    @staticmethod
+    def export_contigset_x_contigs(list_contigset: list):
+        names = ["hash_contigset_x_contig", "hash_contigset", "hash_contig", "contig_index"]
+        data = {k: [] for k in names}
+
+        for hash_contigset_x_contig, hash_contigset, hash_contig, contig_index in tqdm(list_contigset):
+            data["hash_contigset_x_contig"].append(hash_contigset_x_contig)
+            data["hash_contigset"].append(hash_contigset)
+            data["hash_contig"].append(hash_contig)
+            data["contig_index"].append(contig_index)
+
+        table = pa.Table.from_arrays(
+            [pa.array(data[k]) for k in names],
+            names=names)
+
+        return table
+
+    def export_features(self, list_feature):
+        names = ["feature_id", "hash_contigset_x_contig",
+                 "source", "protocol_id", "type",
+                 "start", "end", "strand", "cds_phase"]
+        data = {k: [] for k in names}
+
+        for o in tqdm(list_feature):
+            data["feature_id"].append(o.id)
+            data["hash_contigset_x_contig"].append(o.contigset_x_contig_id)
+            data["source"].append(o.source)
+            data["protocol_id"].append(o.protocol)
+            data["type"].append(o.type)
+            data["start"].append(o.start)
+            data["end"].append(o.end)
+            data["strand"].append(o.strand)
+            data["cds_phase"].append(o.cds_phase)
+
+        table = pa.Table.from_arrays(
+            [pa.array(data[k]) for k in names],
+            names=names)
+
+        return table
+
+    def export_proteins(self, list_protein):
+        names = ["hash_protein_sequence", "description",
+                 "length", "sequence"]
+        data = {k: [] for k in names}
+
+        for o in tqdm(list_protein):
+            data["hash_protein_sequence"].append(o.hash)
+            data["description"].append(None)
+            data["length"].append(len(o.seq))
+            data["sequence"].append(o.seq)
+
+        table = pa.Table.from_arrays(
+            [pa.array(data[k]) for k in names],
+            names=names)
+
+        return table
+
+    def export_feature_x_proteins(self, list_feature_to_proteins):
+        names = ["feature_id", "hash_protein_sequence", "has_stop_codon"]
+        data = {k: [] for k in names}
+
+        for feature_id, hash_protein in tqdm(list_feature_to_proteins):
+            data["feature_id"].append(feature_id)
+            data["hash_protein_sequence"].append(hash_protein)
+            data["has_stop_codon"].append(True)
+
+        table = pa.Table.from_arrays(
+            [pa.array(data[k]) for k in names],
+            names=names)
+
+        return table
\ No newline at end of file
diff --git a/cdm_data_loader_utils/io/__init__.py b/cdm_data_loader_utils/io/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/cdm_data_loader_utils/io/genome.py b/cdm_data_loader_utils/io/genome.py
new file mode 100755
index 0000000..b3a8dfe
--- /dev/null
+++ b/cdm_data_loader_utils/io/genome.py
@@ -0,0 +1,56 @@
+from Bio import SeqIO
+from cdm_data_loader_utils.core.genome import CDMContig
+
+
+def read_gbff_records_from_file(filename: str):
+    if filename.endswith(".gbff"):
+        with open(filename, 'r') as fh:
+            return read_gbff_records(fh)
+    elif filename.endswith(".gz"):
+        import gzip
+        from io import StringIO
+        with gzip.open(filename, 'rb') as fh:
+            return read_gbff_records(StringIO(fh.read().decode('utf-8')))
+
+
+def read_gbff_records(handler):
+    gbff_records = []
+    for record in SeqIO.parse(handler, "gb"):
+        gbff_records.append(record)
+    return gbff_records
+
+
+class ContigFeaturePairsBuilder:
+
+    @staticmethod
+    def get_feature_contig_id(f):
+        ids = {x[0] for x in f.location}
+        if len(ids) == 1:
+            return list(ids)[0]
+        else:
+            raise ValueError('unable to fetch contig id')
+
+    @staticmethod
+    def from_kbase(genome_assembly, genome_features):
+        contig_to_features = {}
+        for contig in genome_assembly.features:
+            contig_to_features[contig.id] = []
+        for feature in genome_features.features:
+            contig_id = ContigFeaturePairsBuilder.get_feature_contig_id(feature)
+            contig_to_features[contig_id].append(feature)
+        contig_feature_pairs = []
+        for contig_id, features in contig_to_features.items():
+            contig = genome_assembly.features.get_by_id(contig_id)
+            cdm_contig = CDMContig(str(contig.seq))
+            contig_feature_pairs.append((cdm_contig, features))
+
+        return contig_feature_pairs
+
+    @staticmethod
+    def from_gbff(filename):
+        gbff_records = read_gbff_records_from_file(filename)
+        contig_feature_pairs = []
+        for rec in gbff_records:
+            cdm_contig = CDMContig(str(rec.seq))
+            contig_feature_pairs.append((cdm_contig, [f for f in rec.features]))
+        return contig_feature_pairs
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100755
index 0000000..8e0e52d
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,27 @@
+[build-system]
+requires = [
+  'setuptools>=40.6.0',
+  'wheel'
+]
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 88
+python-version = ['py38']
+include = '\.pyi?$'
+exclude = '''
+(
+  /(
+      \.eggs         # exclude a few common directories in the
+    | \.git          # root of the project
+    | \.hg
+    | \.mypy_cache
+    | \.tox
+    | \.venv
+    | _build
+    | buck-out
+    | build
+    | dist
+  )/
+)
+'''
diff --git a/requirements.txt b/requirements.txt
index f0918d2..0b617cd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,10 @@
+tqdm
 biopython
+modelseedpy==0.4.2
+sqlalchemy
+pyarrow
 pandas
+polars
 pytest
 coverage
 pytest-coverage
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..aa272e3
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+
+from setuptools import setup, find_packages
+
+with open("README.md") as f:
+    _readme = f.read()
+
+with open("LICENSE") as f:
+    _license = f.read()
+
+setup(
+    name="CDM-Data-Loader-Utils",
+    version="0.0.1",
+    description="CDM-Data-Loader-Utils",
+    long_description_content_type="text/x-rst",
+    long_description=_readme,
+    author="KBase developers",
+    author_email="KBase developers",
+    url="https://github.com/kbase/cdm-data-loader-utils",
+    license=_license,
+    packages=find_packages(exclude=("docs")),
+    package_data={
+
+    },
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Topic :: Scientific/Engineering :: Bio-Informatics",
+        "Intended Audience :: Science/Research",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Natural Language :: English",
+    ],
+    install_requires=[
+        "pandas >= 2.2.2",
+    ],
+    tests_require=[
+        "pytest",
+    ],
+    project_urls={
+        "Documentation": "",
+        "Issues": "",
+    },
+)
diff --git a/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.fna.gz b/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.fna.gz
new file mode 100755
index 0000000..e21b3e7
Binary files /dev/null and b/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.fna.gz differ
diff --git a/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gbff.gz b/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gbff.gz
new file mode 100755
index 0000000..833ee39
Binary files /dev/null and b/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gbff.gz differ
diff --git a/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gff.gz b/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gff.gz
new file mode 100755
index 0000000..7d1c491
Binary files /dev/null and b/tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gff.gz differ
diff --git a/tests/data/genome/GCF_000005845.2_ASM584v2_protein.faa.gz b/tests/data/genome/GCF_000005845.2_ASM584v2_protein.faa.gz
new file mode 100755
index 0000000..3a50efd
Binary files /dev/null and b/tests/data/genome/GCF_000005845.2_ASM584v2_protein.faa.gz differ
diff --git a/tests/data/genome_paths_file/valid.json b/tests/data/genome_paths_file/valid.json
old mode 100644
new mode 100755
diff --git a/tests/src/parsers/test_etl_genome.py b/tests/src/parsers/test_etl_genome.py
new file mode 100755
index 0000000..0e3150f
--- /dev/null
+++ b/tests/src/parsers/test_etl_genome.py
@@ -0,0 +1,13 @@
+from cdm_data_loader_utils.io.genome import ContigFeaturePairsBuilder
+from cdm_data_loader_utils.etl_genome import ETLGbffGenome
+
+
+def test_etl_genome_gbff():
+    pairs = ContigFeaturePairsBuilder.from_gbff('tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gbff.gz')
+    etl = ETLGbffGenome()
+    etl.etl('my_genome', 'some_protocol', pairs)
+
+    assert etl._cdm_contigset is not None
+    assert len(etl._data_cdm_contig) == 1
+    assert len(etl._data_cdm_contigset_x_contig) == 1
+    assert len(etl._data_cdm_protein) == 4257
diff --git a/tests/src/parsers/test_io_genome.py b/tests/src/parsers/test_io_genome.py
new file mode 100755
index 0000000..698a063
--- /dev/null
+++ b/tests/src/parsers/test_io_genome.py
@@ -0,0 +1,7 @@
+from cdm_data_loader_utils.io.genome import read_gbff_records_from_file
+
+
+def test_read_gbff_records_from_file():
+    records = read_gbff_records_from_file('tests/data/genome/GCF_000005845.2_ASM584v2_genomic.gbff.gz')
+
+    assert len(records) == 1