Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
5d0cf65
created stubs UploadedFileSource and RepositoryIdSource
BjornFJohansson Nov 7, 2025
4ea5941
next step
BjornFJohansson Nov 7, 2025
27307e3
next step2
BjornFJohansson Nov 7, 2025
7164964
added rudimentary source/history for local files read in reader/parse…
BjornFJohansson Nov 7, 2025
b6c5fb4
fixed fields in model
BjornFJohansson Nov 7, 2025
1e817b4
enable higher opencloning-linkml versions
manulera Nov 17, 2025
5a6c7d3
support and test edge case from #488
manulera Nov 17, 2025
5028df8
starting to work on better serialization
manulera Nov 25, 2025
c22d42d
simplify serialization to avoid having to use kwargs, instead define …
manulera Nov 25, 2025
7d89f65
fix tests (happens because TARGET_MODEL was not behaving properly in …
manulera Nov 25, 2025
4404482
update notebook output
manulera Nov 25, 2025
e5d5270
add some repositories
manulera Nov 25, 2025
b47b060
add the rest of the models
manulera Nov 25, 2025
242bc4d
add and test oligonucleotide hybridization
manulera Nov 25, 2025
2dfd703
update to latest model version
manulera Nov 25, 2025
e95e860
comment Location out
manulera Nov 25, 2025
ff396f5
fix existing genbank test and adapt to https://github.com/OpenCloning…
manulera Dec 5, 2025
d3dd8a9
update linkml model
manulera Dec 5, 2025
edb1521
update linkml model and drop repository_name field
manulera Dec 6, 2025
6a29aa9
add location serializer to GenomeCoordinatesSource
manulera Dec 6, 2025
a2feb17
update opencloning-linkml to released version
manulera Dec 6, 2025
6f25090
add utility to get coordinates in the format used for the NCBI API
manulera Dec 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions docs/notebooks/history.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -186,13 +186,13 @@
"╙── product (Dseqrecord(-18))\n",
" └─╼ LigationSource\n",
" ├─╼ c (Dseqrecord(-7))\n",
" │ └─╼ Source\n",
" │ └─╼ a (Dseqrecord(-18)) ╾ Source, Source\n",
" │ └─╼ RestrictionEnzymeDigestionSource\n",
" │ └─╼ a (Dseqrecord(-18)) ╾ RestrictionEnzymeDigestionSource, RestrictionEnzymeDigestionSource\n",
" ├─╼ d (Dseqrecord(-12))\n",
" │ └─╼ Source\n",
" │ └─╼ RestrictionEnzymeDigestionSource\n",
" │ └─╼ ...\n",
" └─╼ e (Dseqrecord(-7))\n",
" └─╼ Source\n",
" └─╼ RestrictionEnzymeDigestionSource\n",
" └─╼ ...\n"
]
}
Expand Down Expand Up @@ -354,8 +354,8 @@
" └─╼ CreLoxRecombinationSource\n",
" └─╼ integration_product (Dseqrecord(-84))\n",
" └─╼ CreLoxRecombinationSource\n",
" ├─╼ a (Dseqrecord(-45))\n",
" └─╼ b (Dseqrecord(o39))\n"
" ├─╼ genome (Dseqrecord(-45))\n",
" └─╼ plasmid (Dseqrecord(o39))\n"
]
}
],
Expand Down
482 changes: 132 additions & 350 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ scipy = [
]
seguid = ">=0.0.5"
regex = "^2024.11.6"
opencloning-linkml = "0.4.5"
opencloning-linkml = "^0.4.9"
[tool.poetry.extras]
clipboard = ["pyperclip"]
download = ["pyparsing", "requests"]
Expand Down
2 changes: 2 additions & 0 deletions src/pydna/afile.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>fn
gatc
5 changes: 3 additions & 2 deletions src/pydna/assembly2.py
Original file line number Diff line number Diff line change
Expand Up @@ -2036,7 +2036,7 @@ def _recast_sources(
"""
for prod in products:
prod.source = source_cls(
**prod.source.model_dump(),
**prod.source.to_unserialized_dict(),
**extra_fields,
)
return products
Expand Down Expand Up @@ -2805,7 +2805,8 @@ def crispr_integration(
# The second element of product.source.input is conventionally the insert/repair fragment
# The other two (first and third) are the two bits of the genome
repair_start = _location_boundaries(product.source.input[0].right_location)[0]
repair_end = _location_boundaries(product.source.input[2].left_location)[1]
# Here we do +1 because the position of the cut marks the boundary (e.g. 0:10, 10:20 if a cut is at pos 10)
repair_end = _location_boundaries(product.source.input[2].left_location)[1] + 1
repair_location = create_location(repair_start, repair_end, len(genome))
some_cuts_inside_repair = []
all_cuts_inside_repair = []
Expand Down
38 changes: 24 additions & 14 deletions src/pydna/genbank.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,17 @@
`pydna.ini` file. See the documentation of :func:`pydna.open_config_folder`"""

# from pydna.utils import memorize as _memorize
from pydna.opencloning_models import NCBISequenceSource
from pydna.genbankrecord import GenbankRecord as _GenbankRecord
from pydna.readers import read as _read

from Bio import Entrez as _Entrez
from Bio.SeqFeature import SimpleLocation

from typing import Literal as _Literal, Optional as _Optional
import re as _re
import os as _os

# import logging as _logging

# _module_logger = _logging.getLogger("pydna." + __name__)


# TODO http://httpbin.org/ use for testing?


class Genbank:
"""Class to facilitate download from genbank. It is easier and
Expand Down Expand Up @@ -179,12 +175,29 @@ def nucleotide(

# _module_logger.info("text[:160] %s", text[:160])

return _GenbankRecord(
_read(text), item=item, start=seq_start, stop=seq_stop, strand=strand
result = _read(text)
# TODO: Address this for cases where only one is defined
if seq_start is not None and seq_stop is not None:
location = SimpleLocation(
int(seq_start) - 1, int(seq_stop), -1 if strand == 2 else strand
)
else:
location = None

result.source = NCBISequenceSource(
repository_id=item,
coordinates=location,
)
return result

# return _GenbankRecord(
# _read(text), item=item, start=seq_start, stop=seq_stop, strand=strand
# )


def genbank(accession: str = "CS570233.1", *args, **kwargs) -> _GenbankRecord:
def genbank(
accession: str = "CS570233.1", *args, email=None, **kwargs
) -> _GenbankRecord:
"""
Download a genbank nucleotide record.

Expand Down Expand Up @@ -229,9 +242,6 @@ def genbank(accession: str = "CS570233.1", *args, **kwargs) -> _GenbankRecord:
//

"""
email = _os.getenv("pydna_email")
# _module_logger.info("#### genbank function called ####")
# _module_logger.info("email %s", email)
# _module_logger.info("accession %s", email)
email = email or _os.getenv("pydna_email")
gb = Genbank(email)
return gb.nucleotide(accession, *args, **kwargs)
124 changes: 124 additions & 0 deletions src/pydna/oligonucleotide_hybridization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# -*- coding: utf-8 -*-
"""
This module contains the functions for oligonucleotide hybridization.
"""

from pydna.common_sub_strings import common_sub_strings
from Bio.Seq import reverse_complement
from pydna.primer import Primer
from pydna.dseqrecord import Dseqrecord
from pydna.dseq import Dseq
from pydna.opencloning_models import OligoHybridizationSource, SourceInput


def oligonucleotide_hybridization_overhangs(
    fwd_oligo_seq: str, rvs_oligo_seq: str, minimal_annealing: int
) -> list[int]:
    """
    Compute the overhangs with which two oligonucleotides can hybridize.

    Both sequences are lowercased and the forward oligo is compared with the
    reverse complement of the reverse oligo using ``common_sub_strings``,
    keeping only shared stretches of at least ``minimal_annealing`` bases.
    Every stretch found yields one overhang value, computed as the start of
    the stretch in the reverse-complemented reverse oligo minus its start in
    the forward oligo.

    see https://github.com/manulera/OpenCloning_backend/issues/302 for notation

    Parameters
    ----------
    fwd_oligo_seq : str
        Sequence of the forward oligonucleotide.
    rvs_oligo_seq : str
        Sequence of the reverse oligonucleotide.
    minimal_annealing : int
        Minimal length of the annealing stretch passed to ``common_sub_strings``.

    Returns
    -------
    list[int]
        One overhang per annealing stretch found (empty if there is none).

    Raises
    ------
    ValueError
        If an annealing stretch is flanked, on the same side, by unpaired
        bases of both oligos (i.e. the oligos would anneal with mismatches).

    Examples
    --------
    >>> from pydna.oligonucleotide_hybridization import oligonucleotide_hybridization_overhangs
    >>> oligonucleotide_hybridization_overhangs("ATGGC", "GCCAT", 3)
    [0]
    >>> oligonucleotide_hybridization_overhangs("aATGGC", "GCCAT", 5)
    [-1]
    >>> oligonucleotide_hybridization_overhangs("ATGGC", "GCCATa", 5)
    [1]
    >>> oligonucleotide_hybridization_overhangs("ATGGC", "GCCATaaGCCAT", 5)
    [0, 7]

    If the minimal annealing length is longer than the length of the shortest oligo, it returns an empty list.

    >>> oligonucleotide_hybridization_overhangs("ATGGC", "GCCATaaGCCAT", 100)
    []

    If it's possible to anneal for ``minimal_annealing`` length, but with mismatches, it raises an error.

    >>> oligonucleotide_hybridization_overhangs("cATGGC", "GCCATa", 5)
    Traceback (most recent call last):
        ...
    ValueError: The oligonucleotides can anneal with mismatches
    """
    fwd_len = len(fwd_oligo_seq)
    rvs_len = len(rvs_oligo_seq)

    watson = fwd_oligo_seq.lower()
    crick_rc = reverse_complement(rvs_oligo_seq.lower())
    annealings = common_sub_strings(watson, crick_rc, minimal_annealing)

    overhangs = []
    for start_fwd, start_rvs, span in annealings:
        # A stretch is only a clean annealing if, on each side, at least one
        # of the two oligos ends exactly where the stretch ends.
        dangling_left = start_fwd != 0 and start_rvs != 0
        dangling_right = start_fwd + span < fwd_len and start_rvs + span < rvs_len
        if dangling_left or dangling_right:
            raise ValueError("The oligonucleotides can anneal with mismatches")
        overhangs.append(start_rvs - start_fwd)

    return overhangs


def oligonucleotide_hybridization(
    fwd_primer: Primer, rvs_primer: Primer, minimal_annealing: int
) -> list[Dseqrecord]:
    """
    Hybridize two primers and return every resulting double-stranded product.

    The possible overhangs are obtained from
    ``oligonucleotide_hybridization_overhangs``. For each of them, an
    ``OligoHybridizationSource`` recording the overhang and both primers as
    inputs is created, and a ``Dseqrecord`` wrapping a ``Dseq`` built from the
    two primer sequences and that overhang is returned with the source attached.

    Parameters
    ----------
    fwd_primer : Primer
        Primer used as the watson strand.
    rvs_primer : Primer
        Primer used as the crick strand.
    minimal_annealing : int
        Minimal annealing length forwarded to
        ``oligonucleotide_hybridization_overhangs``.

    Returns
    -------
    list[Dseqrecord]
        One record per possible overhang (empty if the primers cannot anneal).

    Raises
    ------
    ValueError
        Propagated from ``oligonucleotide_hybridization_overhangs`` when the
        primers can only anneal with mismatches.

    Examples
    --------
    >>> from pydna.primer import Primer
    >>> from pydna.oligonucleotide_hybridization import oligonucleotide_hybridization
    >>> fwd_primer = Primer("ATGGC")
    >>> rvs_primer = Primer("GCCA")
    >>> oligonucleotide_hybridization(fwd_primer, rvs_primer, 3)[0].seq
    Dseq(-5)
    ATGGC
     ACCG

    Multiple values can be returned:

    >>> rvs_primer2 = Primer("GCCATaaGCCAT")
    >>> oligonucleotide_hybridization(fwd_primer, rvs_primer2, 3)[0].seq
    Dseq(-12)
    ATGGC
    TACCGaaTACCG
    >>> oligonucleotide_hybridization(fwd_primer, rvs_primer2, 3)[1].seq
    Dseq(-12)
           ATGGC
    TACCGaaTACCG

    If no possible overhangs are found, it returns an empty list.

    >>> oligonucleotide_hybridization(fwd_primer, rvs_primer, 100)
    []

    If there are mismatches given the minimal annealing length, it raises an error.

    >>> fwd_primer3 = Primer("cATGGC")
    >>> rvs_primer3 = Primer("GCCATa")
    >>> oligonucleotide_hybridization(fwd_primer3, rvs_primer3, 5)
    Traceback (most recent call last):
        ...
    ValueError: The oligonucleotides can anneal with mismatches
    """
    watson = str(fwd_primer.seq)
    crick = str(rvs_primer.seq)
    overhangs = oligonucleotide_hybridization_overhangs(
        watson, crick, minimal_annealing
    )

    # First describe every hybridization (its overhang and its two inputs)...
    sources = []
    for overhang in overhangs:
        annealed_oligos = [
            SourceInput(sequence=fwd_primer),
            SourceInput(sequence=rvs_primer),
        ]
        sources.append(
            OligoHybridizationSource(
                overhang_crick_3prime=overhang,
                input=annealed_oligos,
            )
        )

    # ...then build the double-stranded product that each source describes.
    products = []
    for source in sources:
        duplex = Dseq(watson, crick, ovhg=source.overhang_crick_3prime)
        products.append(Dseqrecord(duplex, source=source))
    return products
Loading
Loading