diff --git a/src/pydna/__init__.py b/src/pydna/__init__.py index 35be864a..2b6bdda8 100644 --- a/src/pydna/__init__.py +++ b/src/pydna/__init__.py @@ -5,6 +5,7 @@ # license. Please see the LICENSE.txt file that should have been included # as part of this package. + """ :copyright: Copyright 2013-2023 by Björn Johansson. All rights reserved. :license: This code is part of the pydna package, governed by the @@ -142,7 +143,7 @@ # import configparser as _configparser # import tempfile as _tempfile from pydna._pretty import PrettyTable as _PrettyTable - +from Bio.Restriction import FormattedSeq __author__ = "Björn Johansson" __copyright__ = "Copyright 2013 - 2023 Björn Johansson" @@ -396,3 +397,18 @@ def logo(): f = Figlet() message = f.renderText(message) return _pretty_str(message) + + +## Override Bio.Restriction.FormattedSeq._table + + +def _make_FormattedSeq_table() -> bytes: + table = bytearray(256) + upper_to_lower = ord("A") - ord("a") + for c in b"ABCDEFGHIJKLMNOPQRSTUVWXYZ": # Only allow IUPAC letters + table[c] = c # map uppercase to uppercase + table[c - upper_to_lower] = c # map lowercase to uppercase + return bytes(table) + + +FormattedSeq._table = _make_FormattedSeq_table() diff --git a/src/pydna/alphabet.py b/src/pydna/alphabet.py new file mode 100644 index 00000000..6c634d96 --- /dev/null +++ b/src/pydna/alphabet.py @@ -0,0 +1,590 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from collections import namedtuple +import re as _re + +""" +Nucleic acid alphabet used in pydna + +This file serves to define the DNA aplhabet used in pydna. Each symbol usually represents a basepair +(two opposing bases in the two antiparalell DNA strands). The alphabet is defined in six literal strings +in this file. These strings serve as the single source of thruth for the alphabet and +a series of dictionaries and translation tebles are constructed from their content. + +The strings have the following names: + +- un_ambiguous_ds_dna +- ambiguous_ds_dna +- ds_rna +- single_stranded_dna_rna +- mismatched_dna_rna +- loops_dna_rna + +Each string has five lines following this form: + +W 1 +| 2 +C 3 + 4 +S 5 + +W (line 1) and C (line 2) are complementary bases in a double stranded DNA molecule and S (line 5) are +the symbols of the alphabet used to describe the base pair above the symbol. + +Line 2 must contain only the pipe character, indicating basepairing and line 4 is empty. +The lines must be of equal length and a series ot tests are performed to ensure the integrity of +the alphabet. + +D R IUPAC Single Gaps DNA / RNA +N N extended Strand - = bond mismatches +A A DNA / RNA + • = empty + +GATC UA RYMKSWHBVDN GATC••••U• -----AGCTU AAACCCGGGTTTUUUGCT +|||| || ||||||||||| |||||||||| |||||||||| |||||||||||||||||| +CTAG AU YRKMSWDVBHN ••••CTAG•U AGCTU----- ACGACTAGTCGTGCTUUU + +GATC UO RYMKSWHBVDN PEXIQFZJ$% 0123456789 !#{}&*()<>@:?[]=_; + +""" + +# An alias for whitespace +emptyspace = chr(32) + +un_ambiguous_ds_dna = """\ +GATC +|||| +CTAG + +GATC +""" + +ds_rna = """\ +UA +|| +AU + +UO +""" + +ambiguous_ds_dna = """\ +RYMKSWHBVDN +||||||||||| +YRKMSWDVBHN + +RYMKSWHBVDN +""" + +# The dots in the string below are replaced by emptyspace +single_stranded_dna_rna = """\ +GATC••••U• +|||||||||| +••••CTAG•U + +PEXIQFZJ$% +""".replace( + "•", emptyspace +) + + +mismatched_dna_rna = """\ +AAACCCGGGTTTUUUGCT +|||||||||||||||||| +ACGACTAGTCGTGCTUUU + +!#{}&*()<>@:?[]=_; +""" + +loops_dna_rna = """\ +-----AGCTU +|||||||||| +AGCTU----- + +0123456789 +""" + + +codestrings = { + "un_ambiguous_ds_dna": un_ambiguous_ds_dna, + "ambiguous_ds_dna": ambiguous_ds_dna, + "ds_rna": ds_rna, + "single_stranded_dna_rna": single_stranded_dna_rna, + "mismatched_dna_rna": mismatched_dna_rna, + "loops_dna_rna": loops_dna_rna, +} + +# This string contains ascii letters not used in the alphabet +letters_not_in_dscode = "lL\"',-./\\^`|+~" + +for name, codestring in codestrings.items(): + + # This loops all codestrings and checks for consistency of format. + lines = codestring.splitlines() + + assert len(lines) == 5, f"{name} does not have 5 lines" + + # We want the Watson, Crick and Symbol lines only + # Second line has to be pipes ("|") and fourth has to be empty + + watsn, pipes, crick, empty, symbl = lines + + assert all( + ln.isascii() for ln in (watsn, crick, symbl) + ), f"{name} has non-ascii letters" + + assert all( + ln.isupper() for ln in (watsn, crick, symbl) if ln.isalpha() + ), f"{name} has non-uppercase letters" + + # check so that pipes contain only "|" + assert set(pipes) == set("|"), f"{name} has non-pipe character(s) in line 2" + + # check so strings are the same length + assert all( + len(ln) == len(watsn) for ln in (watsn, pipes, crick, symbl) + ), f"{name} has lines of unequal length" + + # These characters are not used. + assert not any( + [letter in letters_not_in_dscode for letter in symbl] + ), f"{name} has chars outside alphabet" + +""" +The `codes` dictionary is a dict of dicts containing the information of the +code strings in the form if a dict with string names as keys, each containing a +dict wit this structure: (Watson letter, Crick letter): dscode symbol +""" + +codes = dict() + +for name, codestring in codestrings.items(): + + lines = codestring.splitlines() + + watsons, _, cricks, _, symbols = lines + + # d is an alias of codes[name] used in this loop for code clarity. + codes[name] = d = dict() + + for watson, crick, symbol in zip(watsons, cricks, symbols): + d[watson, crick] = symbol + +del d + +basepair_dict = ( + codes["un_ambiguous_ds_dna"] + | codes["ambiguous_ds_dna"] + | codes["ds_rna"] + | codes["single_stranded_dna_rna"] + # | codes["mismatched_dna_rna"] + # | codes["loops_dna_rna"] +) + +annealing_dict = dict() + +""" +The annealing_dict_of_str is constructed below. It contains the information needed +to tell if two DNA fragments (like a and b below) can anneal. This of cource only concerns +single stranded regions. + +The dict has the form (x, y): s +Where x and y are bases in a and b and the symbol s is the resulting symbol for the base pair +that is formed. The letters x and y are from the values of the codes["single_stranded_dna_rna"] +dictionary. + +For, example: One key-value pair is ('P', 'Q'): 'G' which matches the first +of the four new base pairings formed between a and b in the example below. + + +(a) +gggPEXI (dscode for a) + +gggGATC +ccc + aaa (b) + CTAGttt + + QFZJaaa (dscode for b) + + +gggGATCaaa (annealing product between a and b) +cccCTAGttt + +This loops through the base pairs where the upper or lower +positions are empty. (w, c), s would be ("G", " "), "P" +in the first iteration. +""" + +temp = codes["un_ambiguous_ds_dna"] | codes["ds_rna"] + +# Alias to make the code below more readable. +d = codes["single_stranded_dna_rna"] + +for (x, y), symbol in d.items(): + if y == emptyspace: + other = next(b for a, b in temp if a == x) + symbol_other = d[emptyspace, other] + annealing_dict[symbol, symbol_other] = temp[x, other] + annealing_dict[symbol_other, symbol] = temp[x, other] + elif x == emptyspace: + other = next(a for a, b in temp if b == y) + symbol_other = d[other, emptyspace] + annealing_dict[symbol, symbol_other] = temp[other, y] + annealing_dict[symbol_other, symbol] = temp[other, y] + else: + raise ValueError("This should not happen") + +del d, temp + +temp = {} + +for (x, y), symbol in annealing_dict.items(): + + temp[x, emptyspace] = x + temp[emptyspace, y] = y + +annealing_dict_w_holes = annealing_dict | temp + +del temp + +""" +A collection of translation tables are a practical way to obtain Watson and Crick +from dscode or the reverse complement strands when needed. + +These are meant to be used by the bytes.translate method. + + +The translation table "complement_table_for_dscode" is used to obtain the +complement of a DNA sequence in dscode format. +""" + +complement_dict_for_dscode = { + s: basepair_dict[c, w] for (w, c), s in basepair_dict.items() +} + +from_letters = "".join(complement_dict_for_dscode.keys()) +to_letters = "".join(complement_dict_for_dscode.values()) + +from_letters += from_letters.lower() +to_letters += to_letters.lower() + +complement_table_for_dscode = bytes.maketrans( + from_letters.encode("ascii"), to_letters.encode("ascii") +) + +""" +dscode_to_watson_table and dscode_to_crick_table are used to obtain the Watson +and (reverse) Crick strands from dscode. Four extra letters are added to the +table and used in the pydna.dseq.representation function. These are used +to add range indicators ("..") in the watson or crick strings for +representation of long sequences. + +The four letters are placeholder1, placeholder2, interval, empty_bs +""" + +dscode_sense = "" +dscode_compl = "" +watson = "" +crick = "" + +for (w, c), s in basepair_dict.items(): + dscode_sense += s + dscode_compl += basepair_dict[c, w] + watson += w + crick += c + +dscode_sense += dscode_sense.lower() +dscode_compl += dscode_compl.lower() +watson += watson.lower() +crick += crick.lower() + +placeholder1 = "~" +placeholder2 = "+" +interval = "." + +assert placeholder1 in letters_not_in_dscode +assert placeholder2 in letters_not_in_dscode +assert interval in letters_not_in_dscode + +dscode_to_watson_table = bytes.maketrans( + (dscode_sense + placeholder1 + placeholder2).encode("ascii"), + (watson + emptyspace + interval).encode("ascii"), +) + +dscode_to_crick_table = bytes.maketrans( + (dscode_sense + placeholder1 + placeholder2).encode("ascii"), + (crick + interval + emptyspace).encode("ascii"), +) + + +watson_tail_letter_dict = { + w: s for (w, c), s in codes["single_stranded_dna_rna"].items() if c.isspace() +} + +from_letters = "".join(watson_tail_letter_dict.keys()) +to_letters = "".join(watson_tail_letter_dict.values()) + +from_letters += from_letters.lower() +to_letters += to_letters.lower() + +dscode_to_crick_tail_table = bytes.maketrans( + from_letters.encode("ascii"), to_letters.encode("ascii") +) +# crick_tail_to_dscode_table = bytes.maketrans(to_letters.encode("ascii"),from_letters.encode("ascii")) + + +from_letters_full = five_prime_ss_letters = to_letters +to_letters_full = from_letters + + +d = codes["single_stranded_dna_rna"] + +crick_tail_letter_dict = { + complement_dict_for_dscode[c]: s for (w, c), s in d.items() if w.isspace() +} + +del d + +from_letters = "".join(crick_tail_letter_dict.keys()) +to_letters = "".join(crick_tail_letter_dict.values()) + +from_letters += from_letters.lower() +to_letters += to_letters.lower() + +dscode_to_watson_tail_table = bytes.maketrans( + from_letters.encode("ascii"), to_letters.encode("ascii") +) +# watson_tail_to_dscode_table = bytes.maketrans(to_letters.encode("ascii"), from_letters.encode("ascii")) + +three_prime_ss_letters = to_letters +from_letters_full += to_letters +to_letters_full += from_letters + +dscode_to_full_sequence_table = bytes.maketrans( + from_letters_full.encode("ascii"), to_letters_full.encode("ascii") +) + + +# This loop adds upper and lower case symbols +mixed_case_dict = {} + +for (x, y), symbol in basepair_dict.items(): + mixed_case_dict[x.lower(), y.lower()] = symbol.lower() + mixed_case_dict[x.lower(), y.upper()] = symbol.lower() + mixed_case_dict[x.upper(), y.lower()] = symbol.upper() + + if x == emptyspace: + mixed_case_dict[x, y.lower()] = symbol.lower() + mixed_case_dict[x, y.upper()] = symbol.upper() + if y == emptyspace: + mixed_case_dict[x.lower(), y] = symbol.lower() + mixed_case_dict[x.upper(), y] = symbol.upper() + +# Add mixed case entries to the dict +basepair_dict.update(mixed_case_dict) + +# This loop adds upper and lower case symbols +mixed_case_dict = {} + +for (x, y), symbol in annealing_dict.items(): + mixed_case_dict[x.lower(), y.lower()] = symbol.lower() + mixed_case_dict[x.lower(), y.upper()] = symbol.lower() + mixed_case_dict[x.upper(), y.lower()] = symbol.upper() +# Add mixed case entries to the dict +annealing_dict.update(mixed_case_dict) + +ds_letters = ( + "".join(codes["un_ambiguous_ds_dna"].values()) + + "".join(codes["ds_rna"].values()) + + "".join(codes["ambiguous_ds_dna"].values()) +) + +ss_letters_watson = "".join( + s for (w, c), s in codes["single_stranded_dna_rna"].items() if c == emptyspace +) +ss_letters_crick = "".join( + s for (w, c), s in codes["single_stranded_dna_rna"].items() if w == emptyspace +) + +ds_letters += ds_letters.lower() +ss_letters_watson += ss_letters_watson.lower() +ss_letters_crick += ss_letters_crick.lower() + + +""" +The dict of regexes below cover IUPAC Ambiguity Code complements +and is used in the amplify module. +""" +iupac_compl_regex = { + "A": "(?:T|U)", + "C": "(?:G)", + "G": "(?:C)", + "T": "(?:A)", + "U": "(?:A)", + "R": "(?:T|C|Y)", + "Y": "(?:G|A|R)", + "S": "(?:G|C|S)", + "W": "(?:A|T|W)", + "K": "(?:C|AM)", + "M": "(?:T|G|K)", + "B": "(?:C|G|A|V)", + "D": "(?:A|C|T|H)", + "H": "(?:A|G|T|D)", + "V": "(?:T|C|G|B)", + "N": "(?:A|G|C|T|N)", +} + + +# This loop adds upper and lower case symbols +# mixed_case_dict = {} + +for (x, y), symbol in annealing_dict_w_holes.items(): + mixed_case_dict[x.lower(), y.lower()] = symbol.lower() + mixed_case_dict[x.lower(), y.upper()] = symbol.lower() + mixed_case_dict[x.upper(), y.lower()] = symbol.upper() +# Add mixed case entries to the dict +annealing_dict_w_holes.update(mixed_case_dict) + + +def get_parts(datastring: str): + + m = _re.match( + f"([{ss_letters_watson}]*)" # capture group 0 ssDNA in watson strand + f"([{ss_letters_crick}]*)" # capture group 1 ssDNA in crick strand + f"(?=[{ds_letters}])" # no capture, positive lookahead for dsDNA + "(.*)" # captures group 2 everything in the middle + f"(?<=[{ds_letters}])" # no capture,positive look behind for dsDNA + f"([{ss_letters_watson}]*)" # capture group 3 ssDNA in watson strand + f"([{ss_letters_crick}]*)|" # capture group 4 ssDNA in crick strand + f"([{ss_letters_watson}]+)|" # capture group 5 if data contains only upper strand + f"([{ss_letters_crick}]+)", # capture group 6 if data contains only lower strand + datastring, + ) + + result = m.groups() if m else (None, None, None, None, None, None, None) + + result = ["" if e is None else e for e in result] + + field_names = ( + "sticky_left5", + "sticky_left3", + "middle", + "sticky_right3", + "sticky_right5", + "single_watson", + "single_crick", + ) + + fragment = namedtuple("fragment", field_names) + + return fragment(*result) + + +def dsbreaks(data: str): + + wl = _re.escape(five_prime_ss_letters) + cl = _re.escape(three_prime_ss_letters) + + breaks = [] + regex = ( + "(.{0,3})" # context if present + f"([{wl}][{cl}]|[{cl}][{wl}])" # ss chars next to each other + "(.{0,3})" # context if present + ) + for mobj in _re.finditer(regex, data): + chunk = mobj.group() + w, c = representation_tuple(chunk) + breaks.append(f"[{mobj.start()}:{mobj.end()}]\n{w}\n{c}\n") + return breaks + + +def representation_tuple( + datastring: str = "", length_limit_for_repr: int = 30, chunk: int = 4 +): + """ + Two line string representation of a sequence of dscode symbols. + + See pydna.alphabet module for the definition of the pydna dscode + alphabet. The dscode has a symbol (ascii) character for base pairs + and single stranded DNA. + + This function is used by the Dseq.__repr__() method. + + Parameters + ---------- + data : TYPE, optional + DESCRIPTION. The default is "". + + Returns + ------- + str + A two line string containing The Watson and Crick strands. + + """ + + ( + sticky_left5, + sticky_left3, + middle, + sticky_right5, + sticky_right3, + single_watson, + single_crick, + ) = get_parts(datastring) + + if len(datastring) > length_limit_for_repr: + """ + We need to shorten the repr if the sequence is longer than + limit imposed by length_limit_for_repr. + + The representation has three parts, so we divide by three for each part. + + Long DNA strands are interrupted by interval notation, like agc..att + where the two dots indicate intervening hidden sequence. + + + Dseq(-71) + GAAA..AATCaaaa..aaaa + tttt..ttttCTAA..AAAG + + placeholder1, placeholder2 are two letters that are replaced by + interval characters in the upper or lower strands by the translation + """ + + part_limit = length_limit_for_repr // 3 + + if len(sticky_left5) > part_limit: + sticky_left5 = ( + sticky_left5[:chunk] + placeholder2 * 2 + sticky_left5[-chunk:] + ) + + if len(sticky_left3) > part_limit: + sticky_left3 = ( + sticky_left3[:chunk] + placeholder1 * 2 + sticky_left3[-chunk:] + ) + + if len(middle) > part_limit: + middle = middle[:4] + interval * 2 + middle[-4:] + + if len(sticky_right5) > part_limit: + sticky_right5 = ( + sticky_right5[:chunk] + placeholder2 * 2 + sticky_right5[-chunk:] + ) + + if len(sticky_right3) > part_limit: + sticky_right3 = ( + sticky_right3[:chunk] + placeholder1 * 2 + sticky_right3[-chunk:] + ) + + """ + The processed string contains + """ + processed_dscode = (sticky_left5 or sticky_left3) + middle + ( + sticky_right5 or sticky_right3 + ) or single_watson + single_crick + + watson = processed_dscode.translate(dscode_to_watson_table).rstrip() + crick = processed_dscode.translate(dscode_to_crick_table).rstrip() + + return watson, crick diff --git a/src/pydna/amplify.py b/src/pydna/amplify.py index f2946b4c..0b097d60 100644 --- a/src/pydna/amplify.py +++ b/src/pydna/amplify.py @@ -15,9 +15,7 @@ from pydna._pretty import pretty_str as _pretty_str from pydna.utils import flatten as _flatten - -# from pydna.utils import memorize as _memorize -from pydna.utils import rc as _rc, shift_feature as _shift_feature +from pydna.utils import shift_feature as _shift_feature from pydna.amplicon import Amplicon as _Amplicon from pydna.primer import Primer as _Primer from pydna.seqrecord import SeqRecord as _SeqRecord @@ -26,10 +24,11 @@ from Bio.SeqFeature import SimpleLocation as _SimpleLocation from Bio.SeqFeature import CompoundLocation as _CompoundLocation from pydna.seq import Seq as _Seq -import itertools as _itertools import re as _re import copy as _copy import operator as _operator +from pydna.alphabet import iupac_compl_regex as _iupac_compl_regex +from pydna.utils import anneal_from_left as _anneal_from_left # import os as _os @@ -37,25 +36,6 @@ # _module_logger = _logging.getLogger("pydna." + __name__) -_table = { # IUPAC Ambiguity Codes for Nucleotide Degeneracy and U for Uracile - "A": "A", - "C": "C", - "G": "G", - "T": "T", - "U": "A", # XXX - "R": "(A|G)", - "Y": "(C|T)", - "S": "(G|C)", - "W": "(A|T)", - "K": "(G|T)", - "M": "(A|C)", - "B": "(C|G|T)", - "D": "(A|G|T)", - "H": "(A|C|T)", - "V": "(A|C|G)", - "N": "(A|G|C|T)", -} - def _annealing_positions(primer, template, limit): """Finds the annealing position(s) for a primer on a template where the @@ -70,13 +50,14 @@ def _annealing_positions(primer, template, limit): <- - - - - - - - - - template - - - - - - - - - - - - - > - <------- start (int) ------> - 5'-...gctactacacacgtactgactgcctccaagatagagtcagtaaccacactcgat...3' + < ----- start = 26 ------> + 5'- gctactacacacgtactgactgcctccaagatagagtcagtaaccacactcgatag...3' |||||||||||||||||||||||||||||||||||||||||||||||| 3'-gttctatctcagtcattggtgtATAGTG-5' <-footprint length --> + Parameters ---------- primer : string @@ -85,7 +66,7 @@ def _annealing_positions(primer, template, limit): template : string The template sequence 5'-3' - limit : int = 15, optional + limit : int footprint needs to be at least of length limit. Returns @@ -94,32 +75,37 @@ def _annealing_positions(primer, template, limit): [ (start1, footprint1), (start2, footprint2) ,..., ] """ + # under_tail + # anchor AACCACACTCGAT + # CAAGATAGAGTCAGT + # ||||||||||||||| + # gttctatctcagtca + # ttggtgtATAGTG revprimer + # tail + # + # | <- limit -> | + # return empty list if primer too short if len(primer) < limit: return [] - prc = _rc(primer) + revprimer = primer[::-1] # head is minimum part of primer that must anneal - head = prc[:limit].upper() + head = revprimer[:limit].upper() + tail = revprimer[limit:].upper() # Make regex pattern that reflects extended IUPAC DNA code - head = "".join(_table[key] for key in head) - - positions = [m.start() for m in _re.finditer(f"(?={head})", template, _re.I)] - - if positions: - tail = prc[limit:].lower() - length = len(tail) - results = [] - for match_start in positions: - tm = template[match_start + limit : match_start + limit + length].lower() - footprint = len( - list(_itertools.takewhile(lambda x: x[0] == x[1], zip(tail, tm))) - ) - results.append((match_start, footprint + limit)) - return results - return [] + head_regex = "".join(_iupac_compl_regex[key] for key in head) + primer_regex = f"(?:({head_regex})(.{{0,{len(primer) - limit}}}))" + + results = [] + for m in _re.finditer(primer_regex, template.upper()): + anchor, under_tail = m.groups() + match_start = m.start() + match_extension = _anneal_from_left(tail, under_tail[::-1]) + results.append((match_start, limit + match_extension)) + return results # class _Memoize(type): diff --git a/src/pydna/assembly2.py b/src/pydna/assembly2.py index 2e3fdea3..3c791961 100644 --- a/src/pydna/assembly2.py +++ b/src/pydna/assembly2.py @@ -7,7 +7,8 @@ import networkx as _nx import itertools as _itertools from Bio.SeqFeature import SimpleLocation, Location -from Bio.Seq import reverse_complement + +# from Bio.Seq import reverse_complement from Bio.Restriction.Restriction import RestrictionBatch import regex import copy @@ -606,34 +607,35 @@ def primer_template_overlap( def fill_left(seq: _Dseq) -> _Dseq: """Fill the left overhang of a sequence with the complementary sequence.""" - new_watson = seq.watson - new_crick = seq.crick + # new_watson = seq.watson + # new_crick = seq.crick - # Watson 5' overhang - if seq.ovhg < 0: - new_crick = new_crick + reverse_complement(seq.watson[: -seq.ovhg]) - # Crick 5' overhang - elif seq.ovhg > 0: - new_watson = reverse_complement(seq.crick[-seq.ovhg :]) + new_watson + # # Watson 5' overhang + # if seq.ovhg < 0: + # new_crick = new_crick + reverse_complement(seq.watson[: -seq.ovhg]) + # # Crick 5' overhang + # elif seq.ovhg > 0: + # new_watson = reverse_complement(seq.crick[-seq.ovhg :]) + new_watson + # if _Dseq(new_watson, new_crick, 0) != seq.cast_to_ds_left(): - return _Dseq(new_watson, new_crick, 0) + return seq.cast_to_ds_left() def fill_right(seq: _Dseq) -> _Dseq: """Fill the right overhang of a sequence with the complementary sequence.""" - new_watson = seq.watson - new_crick = seq.crick + # new_watson = seq.watson + # new_crick = seq.crick - # Watson 3' overhang - watson_ovhg = seq.watson_ovhg() - if watson_ovhg < 0: - new_watson = new_watson + reverse_complement(seq.crick[:-watson_ovhg]) + # # Watson 3' overhang + # watson_ovhg = seq.watson_ovhg() + # if watson_ovhg < 0: + # new_watson = new_watson + reverse_complement(seq.crick[:-watson_ovhg]) - # Crick 3' overhang - elif watson_ovhg > 0: - new_crick = reverse_complement(seq.watson[-watson_ovhg:]) + new_crick + # # Crick 3' overhang + # elif watson_ovhg > 0: + # new_crick = reverse_complement(seq.watson[-watson_ovhg:]) + new_crick - return _Dseq(new_watson, new_crick, seq.ovhg) + return seq.cast_to_ds_right() # _Dseq(new_watson, new_crick, seq.ovhg) def fill_dseq(seq: _Dseq) -> _Dseq: @@ -797,8 +799,12 @@ def assemble( ] # Join the left sequence including the overlap with the right sequence without the overlap # we use fill_right / fill_left so that it works for ligation of sticky ends + out_dseqrecord = _Dseqrecord( - fill_right(out_dseqrecord.seq) + fill_left(fragment.seq)[overlap:], + fill_right(out_dseqrecord.seq) + + fill_left(fragment.seq)[ + overlap: + ], # FIXME: This is wrong for PCR. Both primers get incorporated into the product. features=out_dseqrecord.features + new_features, ) diff --git a/src/pydna/design.py b/src/pydna/design.py index 446daa05..f7a4544b 100755 --- a/src/pydna/design.py +++ b/src/pydna/design.py @@ -806,3 +806,81 @@ def circular_assembly_fragments(f, overlap=35, maxlink=40): stacklevel=2, ) return assembly_fragments(f, overlap=overlap, maxlink=maxlink, circular=True) + + +""" + + + + + overlap + _____15______ + | | +TTAGAAGTAGGGATAGTCCCAAACCTCACTTACCACTGCCAATAAGGGG | +AATCTTCATCCCTATCAGGGTTTGGAGTGAATGGTGACGGTTATTCCCC | + | aagccgaactgggccagacaacccggcgctaacgcactca + 44 ttcggcttgacccggtctgttgggccgcgattgcgtgagt + | + 9 + + + +TTAGAAGTAGGGATAGTCCCAAACCTCACTTACCACTGCCAATAAGGGG 9 + |||||||||||||||| | + GTGACGGTTATUCCCCTTCGGCTTGA + +AATCTTCATCCCTATCAGGGTTTGGAGTGAATGGTGACGGTTATTCCCC + + aagccgaactgggccagacaacccggcgctaacgcactca +AGGGGAAGCCGAACUGGGC + AGGGGAAGCCGAACUGGGC + | |||||||||||||| + 44 ttcggcttgacccggtctgttgggccgcgattgcgtgagt + +""" + + +def user_assembly_fragments(f, overlap=35, maxlink=40, circular=False): + from itertools import pairwise + from pydna.dseqrecord import Dseqrecord + import re + from pydna.primer import Primer + from pydna.amplify import pcr + + a1 = Dseqrecord("TTAGAAGTAGGGATAGTCCCAAACCTCACTTACCACTGCCAATAAGGGG") + b1 = Dseqrecord("AAGCCGAACTGGGCCAGACAACCCGGCGCTAACGCACTCA") + fragments = [a1, b1] + amplicons = [] + + for fragment in fragments: + amplicons.append(primer_design(fragment)) + + for ths, nxt in pairwise(amplicons): + + A_positions = [m.start() for m in re.finditer("A|a", str(ths.seq))] + T_positions = [m.start() for m in re.finditer("T|t", str(nxt.seq))] + + for x, y in zip(A_positions[::-1], T_positions): + + sticky_length = y + len(ths) - x + + rp = bytearray(nxt.seq[: y + 1].rc()._data + ths.reverse_primer.seq._data) + fp = bytearray(ths.seq[x:]._data + nxt.forward_primer.seq._data) + + fp[sticky_length] = ord(b"U") + rp[sticky_length] = ord(b"U") + + ths.reverse_primer = Primer(rp) + nxt.forward_primer = Primer(fp) + + break + + a2 = pcr(ths).seq + a2_user = a2.user() + a2_sticky, stuffer = a2_user.melt(14) + + b2 = pcr(nxt).seq + b2_user = b2.user() + stuffer, b2_sticky = b2_user.melt(14) + + assert (a1 + b1).seq == a2_sticky + b2_sticky diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py index 6412a176..6dfd60ae 100644 --- a/src/pydna/dseq.py +++ b/src/pydna/dseq.py @@ -1,10 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2013-2023 by Björn Johansson. All rights reserved. -# This code is part of the Python-dna distribution and governed by its -# license. Please see the LICENSE.txt file that should have been included -# as part of this package. """Provides the Dseq class for handling double stranded DNA sequences. Dseq is a subclass of :class:`Bio.Seq.Seq`. The Dseq class @@ -16,85 +12,184 @@ import copy as _copy -import itertools as _itertools import re as _re import sys as _sys import math as _math +import inspect as _inspect +from typing import List as _List, Tuple as _Tuple, Union as _Union -from pydna.seq import Seq as _Seq -from Bio.Seq import _translate_str, _SeqAbstractBaseClass +from Bio.Restriction import RestrictionBatch as _RestrictionBatch +from Bio.Restriction import CommOnly -from pydna._pretty import pretty_str as _pretty_str from seguid import ldseguid as _ldseguid from seguid import cdseguid as _cdseguid +from pydna.seq import Seq as _Seq +from Bio.Seq import _translate_str, _SeqAbstractBaseClass +from pydna._pretty import pretty_str as _pretty_str from pydna.utils import rc as _rc from pydna.utils import flatten as _flatten from pydna.utils import cuts_overlap as _cuts_overlap +from pydna.alphabet import basepair_dict +from pydna.alphabet import dscode_to_watson_table +from pydna.alphabet import dscode_to_crick_table + +# from pydna.alphabet import watson_tail_to_dscode_table +# from pydna.alphabet import crick_tail_to_dscode_table + +from pydna.alphabet import dscode_to_full_sequence_table +from pydna.alphabet import dscode_to_watson_tail_table +from pydna.alphabet import dscode_to_crick_tail_table + +from pydna.alphabet import get_parts +from pydna.alphabet import representation_tuple +from pydna.alphabet import dsbreaks + from pydna.common_sub_strings import common_sub_strings as _common_sub_strings -from Bio.Restriction import RestrictionBatch as _RestrictionBatch -from Bio.Restriction import CommOnly +from pydna.types import DseqType, EnzymesType, CutSiteType -from .types import DseqType, EnzymesType, CutSiteType +# Sequences larger than this gets a truncated representation. +length_limit_for_repr = 30 -from typing import List as _List, Tuple as _Tuple, Union as _Union + +class CircularBytes(bytes): + """ + A circular bytes sequence: indexing and slicing wrap around index 0. + """ + + def __new__(cls, value: bytes | bytearray | memoryview): + return super().__new__(cls, bytes(value)) + + def __getitem__(self, key): + n = len(self) + if n == 0: + if isinstance(key, slice): + return self.__class__(b"") + raise IndexError("CircularBytes index out of range (empty bytes)") + + if isinstance(key, int): + return super().__getitem__(key % n) + + if isinstance(key, slice): + start, stop, step = key.start, key.stop, key.step + step = 1 if step is None else step + if step == 0: + raise ValueError("slice step cannot be zero") + + if step > 0: + start = 0 if start is None else start + stop = n if stop is None else stop + while stop <= start: + stop += n + rng = range(start, stop, step) + else: + start = (n - 1) if start is None else start + stop = -1 if stop is None else stop + while stop >= start: + stop -= n + rng = range(start, stop, step) + + limit = n if step % n == 0 else n * 2 + out = bytearray() + count = 0 + for i in rng: + out.append(super().__getitem__(i % n)) + count += 1 + if count > limit: + break + return self.__class__(bytes(out)) + + return super().__getitem__(key) + + def cutaround(self, start: int, length: int) -> bytes: + """ + Return a circular slice of given length starting at index `start`. + Can exceed len(self), wrapping around as needed. + + Examples + -------- + s = CircularBytes(b"ABCDE") + assert s.cutaround(3, 7) == b"DEABCDE" + assert s.cutaround(-1, 4) == b"EABC" + """ + n = len(self) + if n == 0 or length <= 0: + return self.__class__(b"") + + start %= n + out = bytearray() + for i in range(length): + out.append(self[(start + i) % n]) + return self.__class__(bytes(out)) class Dseq(_Seq): - """Dseq holds information for a double stranded DNA fragment. + """Dseq describes a double stranded DNA fragment, linear or circular. + + Dseq can be initiated in two ways, using two strings, each representing the + Watson (upper, sense) strand, the Crick (lower, antisense) strand and an + optional value describing the stagger betwen the strands on the left side (ovhg). + + Alternatively, a single string represenation using dsIUPAC codes can be used. + If a single string is used, the letters of that string are interpreted as base + pairs rather than single bases. For example "A" would indicate the basepair + "A/T". An expanded IUPAC code is used where the letters PEXI have been assigned + to GATC on the Watson strand with no paring base on the Crick strand G/"", A/"", + T/"" and C/"". The letters QFZJ have been assigned the opposite base pairs with + an empty Watson strand ""/G, ""/A, ""/T, and ""/C. + + :: + + PEXIGATCQFZJ would indicate the linear double-stranded fragment: + + GATCGATC + CTAGCTAG + - Dseq also holds information describing the topology of - the DNA fragment (linear or circular). Parameters ---------- watson : str - a string representing the watson (sense) DNA strand. + a string representing the Watson (sense) DNA strand or a basepair + represenation. crick : str, optional - a string representing the crick (antisense) DNA strand. + a string representing the Crick (antisense) DNA strand. ovhg : int, optional A positive or negative number to describe the stagger between the - watson and crick strands. + Watson and Crick strands. see below for a detailed explanation. - linear : bool, optional - True indicates that sequence is linear, False that it is circular. - circular : bool, optional True indicates that sequence is circular, False that it is linear. Examples -------- - Dseq is a subclass of the Biopython Seq object. It stores two - strings representing the watson (sense) and crick(antisense) strands. - two properties called linear and circular, and a numeric value ovhg - (overhang) describing the stagger for the watson and crick strand - in the 5' end of the fragment. - - The most common usage is probably to create a Dseq object as a - part of a Dseqrecord object (see :class:`pydna.dseqrecord.Dseqrecord`). + Dseq is a subclass of the Biopython Bio.Seq.Seq class. The constructor + can accept two strings representing the Watson (sense) and Crick(antisense) + DNA strands. These are interpreted as single stranded DNA. There is a check + for complementarity between the strands. - There are three ways of creating a Dseq object directly listed below, but you can also - use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq: + If the DNA molecule is staggered on the left side, an integer ovhg + (overhang) must be given, describing the stagger between the Watson and Crick strand + in the 5' end of the fragment. - Only one argument (string): + Additionally, the optional boolean parameter circular can be given to indicate if the + DNA molecule is circular. - >>> from pydna.dseq import Dseq - >>> Dseq("aaa") - Dseq(-3) - aaa - ttt + The most common usage of the Dseq class is probably not to use it directly, but to + create it as part of a Dseqrecord object (see :class:`pydna.dseqrecord.Dseqrecord`). + This works in the same way as for the relationship between the :class:`Bio.Seq.Seq` and + :class:`Bio.SeqRecord.SeqRecord` classes in Biopython. - The given string will be interpreted as the watson strand of a - blunt, linear double stranded sequence object. The crick strand - is created automatically from the watson strand. + There are multiple ways of creating a Dseq object directly listed below, but you can also + use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq: - Two arguments (string, string): + Two arguments (string, string), no overhang provided: >>> from pydna.dseq import Dseq >>> Dseq("gggaaat","ttt") @@ -102,16 +197,14 @@ class Dseq(_Seq): gggaaat ttt - If both watson and crick are given, but not ovhg an attempt - will be made to find the best annealing between the strands. - There are limitations to this. For long fragments it is quite - slow. The length of the annealing sequences have to be at least - half the length of the shortest of the strands. + If Watson and Crick are given, but not ovhg, an attempt will be made to find the best annealing + between the strands. There are important limitations to this. If there are several ways to + anneal the strands, this will fail. For long fragments it is quite slow. Three arguments (string, string, ovhg=int): - The ovhg parameter is an integer describing the length of the - crick strand overhang in the 5' end of the molecule. + The ovhg parameter is an integer describing the length of the Crick strand overhang on the + left side (the 5' end of Watson strand). The ovhg parameter controls the stagger at the five prime end:: @@ -134,29 +227,29 @@ class Dseq(_Seq): Example of creating Dseq objects with different amounts of stagger: - >>> Dseq(watson="agt", crick="actta", ovhg=-2) + >>> Dseq(watson="att", crick="acata", ovhg=-2) Dseq(-7) - agt - attca - >>> Dseq(watson="agt",crick="actta",ovhg=-1) + att + ataca + >>> Dseq(watson="ata",crick="acata",ovhg=-1) Dseq(-6) - agt - attca - >>> Dseq(watson="agt",crick="actta",ovhg=0) + ata + ataca + >>> Dseq(watson="taa",crick="actta",ovhg=0) Dseq(-5) - agt + taa attca - >>> Dseq(watson="agt",crick="actta",ovhg=1) + >>> Dseq(watson="aag",crick="actta",ovhg=1) Dseq(-5) - agt + aag attca >>> Dseq(watson="agt",crick="actta",ovhg=2) Dseq(-5) agt attca - If the ovhg parameter is specified a crick strand also - needs to be supplied, otherwise an exception is raised. + If the ovhg parameter is specified a Crick strand also needs to be supplied, or + an exception is raised. >>> Dseq(watson="agt", ovhg=2) Traceback (most recent call last): @@ -165,22 +258,21 @@ class Dseq(_Seq): else: ValueError: ovhg defined without crick strand! + The shape or topology of the fragment is set by the circular parameter, True or False (default). - The shape of the fragment is set by circular = True, False - - Note that both ends of the DNA fragment has to be compatible to set - circular = True. - - - >>> Dseq("aaa","ttt") + >>> Dseq("aaa", "ttt", ovhg = 0) # A linear sequence by default Dseq(-3) aaa ttt - >>> Dseq("aaa","ttt",ovhg=0) + >>> Dseq("aaa", "ttt", ovhg = 0, circular = False) # A linear sequence if circular is False Dseq(-3) aaa ttt - >>> Dseq("aaa","ttt",ovhg=1) + >>> Dseq("aaa", "ttt", ovhg = 0, circular = True) # A circular sequence + Dseq(o3) + aaa + ttt + >>> Dseq("aaa", "ttt", ovhg=1, circular = True) Dseq(-4) aaa ttt @@ -210,6 +302,18 @@ class Dseq(_Seq): -4 >>> + + dsIUPAC [#]_ is an nn extension to the IUPAC alphabet used to describe ss regions: + + :: + + aaaGATC GATCccc ad-hoc representations + CTAGttt gggCTAG + + QFZJaaaPEXI PEXIcccQFZJ dsIUPAC + + + Coercing to string >>> str(a) @@ -295,8 +399,6 @@ class Dseq(_Seq): """ - trunc = 30 - def __init__( self, watson: _Union[str, bytes], @@ -306,35 +408,67 @@ def __init__( pos=0, ): if isinstance(watson, bytes): - watson = watson.decode("ASCII") + # watson is decoded to a string if needed. + watson = watson.decode("ascii") if isinstance(crick, bytes): - crick = crick.decode("ASCII") + # crick is decoded to a string if needed. + crick = crick.decode("ascii") if crick is None: if ovhg is not None: - raise ValueError("ovhg defined without crick strand!") - crick = _rc(watson) - ovhg = 0 - self._data = bytes(watson, encoding="ASCII") + raise ValueError("ovhg (overhang) defined without a crick strand.") + """ + Giving only the watson string implies inferring the Crick complementary strand + from the Watson sequence. The watson string can contain dscode letters wich will + be interpreted as outlined in the pydna.alphabet module. + + The _data property must be a byte string for compatibility with + Biopython Bio.Seq.Seq + """ + data = watson + self._data = data.encode("ascii") - else: # crick strand given - if ovhg is None: # ovhg not given + else: + """ + Crick strand given, ovhg is optional. An important consequence is that the + watson and crick strands are interpreted as single stranded DNA that is + supposed to anneal. + + If ovhg was not given, we try to guess the value below. This will fail + if there are two or more ways to anneal with equal length of the double + stranded part. + """ + if ovhg is None: # ovhg not given, try to guess from sequences + limit = int(_math.log(len(watson)) / _math.log(4)) olaps = _common_sub_strings( str(watson).lower(), str(_rc(crick).lower()), - int(_math.log(len(watson)) / _math.log(4)), + limit, ) + + """No overlaps found, strands do not anneal""" if len(olaps) == 0: raise ValueError( - "Could not anneal the two strands." " Please provide ovhg value" + "Could not anneal the two strands." + f" looked for annealing with at least {limit} basepairs" + " Please provide and overhang value (ovhg parameter)" ) - # We extract the positions and length of the first (longest) overlap, since - # common_sub_strings sorts the overlaps by length. - pos_watson, pos_crick, longest_olap_length = olaps[0] + """ + We extract the positions and length of the first (longest) overlap, + since common_sub_strings sorts the overlaps by length, longest first. + """ - # We see if there is another overlap of the same length - if any(olap[2] >= longest_olap_length for olap in olaps[1:]): + (pos_watson, pos_crick, longest_olap_length), *rest = olaps + + """ + We see if there is another overlap of the same length + This means that annealing is ambigous. USer should provide + and ovhg value. + """ + if any( + olap_length >= longest_olap_length for _, _, olap_length in rest + ): raise ValueError( "More than one way of annealing the" " strands. Please provide ovhg value" @@ -342,79 +476,57 @@ def __init__( ovhg = pos_crick - pos_watson - sns = (ovhg * " ") + _pretty_str(watson) - asn = (-ovhg * " ") + _pretty_str(_rc(crick)) - - self._data = bytes( - "".join( - [ - a.strip() or b.strip() - for a, b in _itertools.zip_longest(sns, asn, fillvalue=" ") - ] - ), - encoding="ASCII", - ) + """ + Pad both strands on left side ovhg spaces + a negative number gives no padding, + """ + sense = ovhg * " " + watson + antisense = -ovhg * " " + crick[::-1] + + max_len = max(len(sense), len(antisense)) + + """pad both strands on right side to same size.""" + sense = sense.ljust(max_len) + antisense = antisense.ljust(max_len) + """both strands padded so that bsepairs align""" + assert len(sense) == len(antisense) - else: # ovhg given - if ovhg == 0: - if len(watson) >= len(crick): - self._data = bytes(watson, encoding="ASCII") - else: - self._data = bytes( - watson + _rc(crick[: len(crick) - len(watson)]), - encoding="ASCII", - ) - elif ovhg > 0: - if ovhg + len(watson) > len(crick): - self._data = bytes( - _rc(crick[-ovhg:]) + watson, encoding="ASCII" - ) - else: - self._data = bytes( - _rc(crick[-ovhg:]) - + watson - + _rc(crick[: len(crick) - ovhg - len(watson)]), - encoding="ASCII", - ) - else: # ovhg < 0 - if -ovhg + len(crick) > len(watson): - self._data = bytes( - watson + _rc(crick[: -ovhg + len(crick) - len(watson)]), - encoding="ASCII", - ) - else: - self._data = bytes(watson, encoding="ASCII") + data = [] + + for w, c in zip(sense, antisense): + try: + data.append(basepair_dict[w, c]) + except KeyError as err: + print(f"Base mismatch in representation {err}") + raise ValueError() + data = "".join(data) + self._data = data.encode("ascii") self.circular = circular - self.watson = _pretty_str(watson) - self.crick = _pretty_str(crick) - self.length = len(self._data) - self.ovhg = ovhg self.pos = pos + if circular: + data += data[0:1] + + dsb = dsbreaks(data) + + if dsb: + msg = "".join(dsb) + raise ValueError( + f"Molecule is internally split in {len(dsb)} location(s):\n\n{msg}".strip() + ) + @classmethod - def quick( - cls, - watson: str, - crick: str, - ovhg=0, - circular=False, - pos=0, - ): - obj = cls.__new__(cls) # Does not call __init__ - obj.watson = _pretty_str(watson) - obj.crick = _pretty_str(crick) - obj.ovhg = ovhg + def quick(cls, data: bytes, *args, circular=False, pos=0, **kwargs): + """Fastest way to instantiate an object of the Dseq class. + + No checks of parameters are made. + Does not call Bio.Seq.Seq.__init__() which has lots of time consuming checks. + """ + obj = cls.__new__(cls) obj.circular = circular - obj.length = max(len(watson) + max(0, ovhg), len(crick) + max(0, -ovhg)) obj.pos = pos - wb = bytes(watson, encoding="ASCII") - cb = bytes(crick, encoding="ASCII") - obj._data = ( - _rc(cb[-max(0, ovhg) or len(cb) :]) - + wb - + _rc(cb[: max(0, len(cb) - ovhg - len(wb))]) - ) + obj._data = data return obj @classmethod @@ -422,40 +534,47 @@ def from_string( cls, dna: str, *args, - # linear=True, circular=False, **kwargs, ): - obj = cls.__new__(cls) # Does not call __init__ - obj.watson = _pretty_str(dna) - obj.crick = _pretty_str(_rc(dna)) - obj.ovhg = 0 + obj = cls.__new__(cls) obj.circular = circular - # obj._linear = linear - obj.length = len(dna) obj.pos = 0 - obj._data = bytes(dna, encoding="ASCII") + obj._data = dna.encode("ascii") return obj @classmethod def from_representation(cls, dsdna: str, *args, **kwargs): - obj = cls.__new__(cls) # Does not call __init__ - w, c, *r = [ln for ln in dsdna.splitlines() if ln] - ovhg = obj.ovhg = len(w) - len(w.lstrip()) - (len(c) - len(c.lstrip())) - watson = obj.watson = _pretty_str(w.strip()) - crick = obj.crick = _pretty_str(c.strip()[::-1]) + obj = cls.__new__(cls) obj.circular = False - # obj._linear = True - obj.length = max(len(watson) + max(0, ovhg), len(crick) + max(0, -ovhg)) obj.pos = 0 - wb = bytes(watson, encoding="ASCII") - cb = bytes(crick, encoding="ASCII") - obj._data = ( - _rc(cb[-max(0, ovhg) or len(cb) :]) - + wb - + _rc(cb[: max(0, len(cb) - ovhg - len(wb))]) - ) - return obj + clean = _inspect.cleandoc("\n" + dsdna) + watson, crick = [ + ln + for ln in clean.splitlines() + if ln.strip() and not ln.strip().startswith("Dseq(") + ] + ovhgw = len(watson) - len(watson.lstrip()) + ovhgc = -(len(crick) - len(crick.lstrip())) + + ovhg = ovhgw or ovhgc + + watson = watson.strip() + crick = crick.strip()[::-1] + + return Dseq(watson, crick, ovhg) + + # watson = f"{watson:<{len(crick)}}" + # crick = f"{crick:<{len(watson)}}" + # data = bytearray() + # for w, c in zip(watson, crick): + # try: + # data.extend(basepair_dict[w, c]) + # except KeyError as err: + # print(f"Base mismatch in representation {err}") + # raise + # obj._data = bytes(data) + # return obj @classmethod def from_full_sequence_and_overhangs( @@ -522,107 +641,82 @@ def from_full_sequence_and_overhangs( return Dseq(watson, crick=crick, ovhg=crick_ovhg) - # @property - # def ovhg(self): - # """The ovhg property. This cannot be set directly, but is a - # consequence of how the watson and crick strands anneal to - # each other""" - # return self._ovhg - - # @property - # def linear(self): - # """The linear property can not be set directly. - # Use an empty slice [:] to create a linear object.""" - # return self._linear - - # @property - # def circular(self): - # """The circular property can not be set directly. - # Use :meth:`looped` to create a circular Dseq object""" - # return self._circular + @property + def watson(self) -> str: + """ + The watson (upper) strand of the double stranded fragment 5'-3'. - def mw(self) -> float: - """This method returns the molecular weight of the DNA molecule - in g/mol. The following formula is used:: + Returns + ------- + TYPE + DESCRIPTION. - MW = (A x 313.2) + (T x 304.2) + - (C x 289.2) + (G x 329.2) + - (N x 308.9) + 79.0 """ - nts = (self.watson + self.crick).lower() + return self._data.decode("ascii").translate(dscode_to_watson_table).strip() - return ( - 313.2 * nts.count("a") - + 304.2 * nts.count("t") - + 289.2 * nts.count("c") - + 329.2 * nts.count("g") - + 308.9 * nts.count("n") - + 79.0 - ) + @property + def crick(self) -> str: + """ + The crick (lower) strand of the double stranded fragment 5'-3'. - def upper(self: DseqType) -> DseqType: - """Return an upper case copy of the sequence. + Returns + ------- + TYPE + DESCRIPTION. - >>> from pydna.dseq import Dseq - >>> my_seq = Dseq("aAa") - >>> my_seq - Dseq(-3) - aAa - tTt - >>> my_seq.upper() - Dseq(-3) - AAA - TTT + """ + return self._data.decode("ascii").translate(dscode_to_crick_table).strip()[::-1] + + @property + def ovhg(self) -> int: + """ + The 5' overhang of the lower strand compared the the upper. + + See module docstring for more information. Returns ------- - Dseq - Dseq object in uppercase + TYPE + DESCRIPTION. - See also - -------- - pydna.dseq.Dseq.lower + """ + parts = get_parts(self._data.decode("ascii")) + if parts.single_watson or parts.single_crick: + return None + return -len(parts.sticky_left5) or len(parts.sticky_left3) + def to_blunt_string(self): """ - return self.quick( - self.watson.upper(), - self.crick.upper(), - ovhg=self.ovhg, - # linear=self.linear, - circular=self.circular, - pos=self.pos, - ) + The sequence as a blunt ended string. - def lower(self: DseqType) -> DseqType: - """Return a lower case copy of the sequence. - - >>> from pydna.dseq import Dseq - >>> my_seq = Dseq("aAa") - >>> my_seq - Dseq(-3) - aAa - tTt - >>> my_seq.lower() - Dseq(-3) - aaa - ttt Returns ------- - Dseq - Dseq object in lowercase + TYPE + DESCRIPTION. - See also - -------- - pydna.dseq.Dseq.upper """ - return self.quick( - self.watson.lower(), - self.crick.lower(), - ovhg=self.ovhg, - # linear=self.linear, - circular=self.circular, - pos=self.pos, + return self._data.decode("ascii").translate(dscode_to_full_sequence_table) + + __str__ = to_blunt_string + + def mw(self) -> float: + """This method returns the molecular weight of the DNA molecule + in g/mol. The following formula is used:: + + MW = (A x 313.2) + (T x 304.2) + + (C x 289.2) + (G x 329.2) + + (N x 308.9) + 79.0 + """ + nts = (self.watson + self.crick).lower() + + return ( + 313.2 * nts.count("a") + + 304.2 * nts.count("t") + + 289.2 * nts.count("c") + + 329.2 * nts.count("g") + + 308.9 * nts.count("n") + + 79.0 ) def find( @@ -671,54 +765,33 @@ def find( return (_pretty_str(self) + _pretty_str(self)).find(sub, start, end) - def __getitem__(self, sl: slice) -> "Dseq": - """Returns a subsequence. This method is used by the slice notation""" - - if not self.circular: - x = len(self.crick) - self.ovhg - len(self.watson) - - sns = (self.ovhg * " " + self.watson + x * " ")[sl] - asn = (-self.ovhg * " " + self.crick[::-1] + -x * " ")[sl] - - ovhg = max( - (len(sns) - len(sns.lstrip()), -len(asn) + len(asn.lstrip())), key=abs - ) - - return Dseq( - sns.strip(), - asn[::-1].strip(), - ovhg=ovhg, - # linear=True - ) - else: - sl = slice(sl.start or 0, sl.stop or len(self), sl.step) - if sl.start > len(self) or sl.stop > len(self): - return Dseq("") - if sl.start < sl.stop: - return Dseq( - self.watson[sl], - self.crick[::-1][sl][::-1], - ovhg=0, - # linear=True - ) - else: - try: - stp = abs(sl.step) - except TypeError: - stp = 1 - start = sl.start - stop = sl.stop - - w = ( - self.watson[(start or len(self)) :: stp] - + self.watson[: (stop or 0) : stp] - ) - c = ( - self.crick[len(self) - stop :: stp] - + self.crick[: len(self) - start : stp] - ) - - return Dseq(w, c, ovhg=0) # , linear=True) + def __contains__(self, item: [str, bytes]): + if isinstance(item, bytes): + item = item.decode("ascii") + return item in self.to_blunt_string() + + def __getitem__(self, sl: [slice, int]) -> "Dseq": + """Method is used by the slice notation""" + if isinstance(sl, int): + # slice(start, stop, step) where stop = start+1 + sl = slice(sl, sl + 1, 1) + sl = slice(sl.start or 0, sl.stop or len(self), sl.step) + if self.circular: + if sl.start is None and sl.stop is None: + # Returns a linear copy using slice obj[:] + return self.quick(self._data[sl]) + elif sl.start == sl.stop: + # Returns a shifted object + shift = sl.start % len(self) + return self.quick(self._data[shift:] + self._data[:shift]) + elif ( + sl.start > sl.stop + and 0 <= sl.start <= len(self) + and 0 <= sl.stop <= len(self) + ): + # Returns the circular slice spanning the origin + return self.quick(self._data[sl.start :] + self._data[: sl.stop]) + return super().__getitem__(sl) def __eq__(self, other: DseqType) -> bool: """Compare to another Dseq object OR an object that implements @@ -738,85 +811,15 @@ def __eq__(self, other: DseqType) -> bool: same = False return same - def __repr__(self): - """Returns a representation of the sequence, truncated if - longer than 30 bp""" - - if len(self) > Dseq.trunc: - if self.ovhg > 0: - d = self.crick[-self.ovhg :][::-1] - hej = len(d) - if len(d) > 10: - d = "{}..{}".format(d[:4], d[-4:]) - a = len(d) * " " - - elif self.ovhg < 0: - a = self.watson[: max(0, -self.ovhg)] - hej = len(a) - if len(a) > 10: - a = "{}..{}".format(a[:4], a[-4:]) - d = len(a) * " " - else: - a = "" - d = "" - hej = 0 - - x = self.ovhg + len(self.watson) - len(self.crick) - - if x > 0: - c = self.watson[len(self.crick) - self.ovhg :] - y = len(c) - if len(c) > 10: - c = "{}..{}".format(c[:4], c[-4:]) - f = len(c) * " " - elif x < 0: - f = self.crick[:-x][::-1] - y = len(f) - if len(f) > 10: - f = "{}..{}".format(f[:4], f[-4:]) - c = len(f) * " " - else: - c = "" - f = "" - y = 0 - - L = len(self) - hej - y - x1 = -min(0, self.ovhg) - x2 = x1 + L - x3 = -min(0, x) - x4 = x3 + L - - b = self.watson[x1:x2] - e = self.crick[x3:x4][::-1] - - if len(b) > 10: - b = "{}..{}".format(b[:4], b[-4:]) - e = "{}..{}".format(e[:4], e[-4:]) - - return _pretty_str( - "{klass}({top}{size})\n" "{a}{b}{c}\n" "{d}{e}{f}" - ).format( - klass=self.__class__.__name__, - top={False: "-", True: "o"}[self.circular], - size=len(self), - a=a, - b=b, - c=c, - d=d, - e=e, - f=f, - ) + def __repr__(self, lim: int = length_limit_for_repr): - else: - return _pretty_str( - "{}({}{})\n{}\n{}".format( - self.__class__.__name__, - {False: "-", True: "o"}[self.circular], - len(self), - self.ovhg * " " + self.watson, - -self.ovhg * " " + self.crick[::-1], - ) - ) + header = f"{self.__class__.__name__}({({False: '-', True: 'o'}[self.circular])}{len(self)})" + + w, c = representation_tuple( + self._data.decode("ascii"), length_limit_for_repr=length_limit_for_repr + ) + + return _pretty_str(header + "\n" + w + "\n" + c) def reverse_complement(self) -> "Dseq": """Dseq object where watson and crick have switched places. @@ -839,12 +842,7 @@ def reverse_complement(self) -> "Dseq": >>> """ - return Dseq.quick( - self.crick, - self.watson, - ovhg=len(self.watson) - len(self.crick) + self.ovhg, - circular=self.circular, - ) + return Dseq.quick(_rc(self._data), circular=self.circular) rc = reverse_complement # alias for reverse_complement @@ -876,19 +874,21 @@ def looped(self: DseqType) -> DseqType: Dseq(o8) catcgatc gtagctag - >>> a.T4("t") + >>> b = Dseq("iatcgatj") + >>> b Dseq(-8) catcgat tagctag - >>> a.T4("t").looped() + >>> b.looped() Dseq(o7) catcgat gtagcta - >>> a.T4("a") + >>> c = Dseq("ietcgazj") + >>> c Dseq(-8) catcga agctag - >>> a.T4("a").looped() + >>> c.looped() Traceback (most recent call last): File "", line 1, in File "/usr/local/lib/python2.7/dist-packages/pydna/dsdna.py", line 357, in looped @@ -900,18 +900,11 @@ def looped(self: DseqType) -> DseqType: """ if self.circular: return _copy.deepcopy(self) + type5, sticky5 = self.five_prime_end() type3, sticky3 = self.three_prime_end() if type5 == type3 and str(sticky5) == str(_rc(sticky3)): - nseq = self.__class__.quick( - self.watson, - self.crick[-self.ovhg :] + self.crick[: -self.ovhg], - ovhg=0, - # linear=False, - circular=True, - ) - # assert len(nseq.crick) == len(nseq.watson) - return nseq + return self.__class__(self.watson, circular=True) else: raise TypeError( "DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!" @@ -953,7 +946,7 @@ def tolinear(self: DseqType) -> DseqType: # pragma: no cover raise TypeError("DNA is not circular.\n") selfcopy = _copy.deepcopy(self) selfcopy.circular = False - return selfcopy # self.__class__(self.watson, linear=True) + return selfcopy def five_prime_end(self) -> _Tuple[str, str]: """Returns a tuple describing the structure of the 5' end of @@ -990,20 +983,28 @@ def five_prime_end(self) -> _Tuple[str, str]: pydna.dseq.Dseq.three_prime_end """ - if self.watson and not self.crick: - return "5'", self.watson.lower() - if not self.watson and self.crick: - return "3'", self.crick.lower() - if self.ovhg < 0: - sticky = self.watson[: -self.ovhg].lower() + parts = get_parts(self._data.decode("ascii")) + + sticky5 = parts.sticky_left5.translate(dscode_to_watson_table) + + sticky3 = parts.sticky_left3.translate(dscode_to_crick_table)[::-1] + + single_watson = parts.single_watson.translate(dscode_to_watson_table) + + single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1] + + if sticky := single_watson: + type_ = "single" + elif sticky := single_crick: + type_ = "single" + elif sticky5 == sticky3 == "": + type_, sticky = "blunt", "" + elif sticky := sticky5: type_ = "5'" - elif self.ovhg > 0: - sticky = self.crick[-self.ovhg :].lower() + elif sticky := sticky3: type_ = "3'" - else: - sticky = "" - type_ = "blunt" - return type_, sticky + + return type_, sticky.lower() def three_prime_end(self) -> _Tuple[str, str]: """Returns a tuple describing the structure of the 5' end of @@ -1039,42 +1040,38 @@ def three_prime_end(self) -> _Tuple[str, str]: """ - ovhg = len(self.watson) - len(self.crick) + self.ovhg + parts = get_parts(self._data.decode("ascii")) - if ovhg < 0: - sticky = self.crick[:-ovhg].lower() - type_ = "5'" - elif ovhg > 0: - sticky = self.watson[-ovhg:].lower() - type_ = "3'" - else: - sticky = "" - type_ = "blunt" - return type_, sticky + sticky5 = parts.sticky_right5.translate(dscode_to_crick_table)[::-1] - def watson_ovhg(self) -> int: - """Returns the overhang of the watson strand at the three prime.""" - return len(self.watson) - len(self.crick) + self.ovhg + sticky3 = parts.sticky_right3.translate(dscode_to_watson_table) - def __add__(self: DseqType, other: DseqType) -> DseqType: - """Simulates ligation between two DNA fragments. + single_watson = parts.single_watson.translate(dscode_to_watson_table) + + single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1] - Add other Dseq object at the end of the sequence. - Type error is raised if any of the points below are fulfilled: + if sticky := single_watson: + type_ = "single" + elif sticky := single_crick: + type_ = "single" + elif sticky5 == sticky3 == "": + type_, sticky = "blunt", "" + elif sticky := sticky5: + type_ = "5'" + elif sticky := sticky3: + type_ = "3'" - * one or more objects are circular - * if three prime sticky end of self is not the same type - (5' or 3') as the sticky end of other - * three prime sticky end of self complementary with five - prime sticky end of other. + return type_, sticky.lower() - Phosphorylation and dephosphorylation is not considered. + def watson_ovhg(self) -> int: + """Overhang of the watson strand at the right side.""" + parts = get_parts(self._data.decode("ascii")) + if parts.single_watson or parts.single_crick: + return None + return -len(parts.sticky_right5) or len(parts.sticky_right3) - DNA is allways presumed to have the necessary 5' phospate - group necessary for ligation. + def __add__(self: DseqType, other: DseqType) -> DseqType: - """ - # test for circular DNA if self.circular: raise TypeError("circular DNA cannot be ligated!") try: @@ -1083,20 +1080,20 @@ def __add__(self: DseqType, other: DseqType) -> DseqType: except AttributeError: pass + if not other: + return _copy.deepcopy(self) + + elif not self: + return _copy.deepcopy(other) + self_type, self_tail = self.three_prime_end() other_type, other_tail = other.five_prime_end() if self_type == other_type and str(self_tail) == str(_rc(other_tail)): - answer = Dseq.quick( + return self.__class__( self.watson + other.watson, other.crick + self.crick, self.ovhg ) - elif not self: - answer = _copy.deepcopy(other) - elif not other: - answer = _copy.deepcopy(self) - else: - raise TypeError("sticky ends not compatible!") - return answer + raise TypeError("sticky ends not compatible!") def __mul__(self: DseqType, number: int) -> DseqType: if not isinstance(number, int): @@ -1449,18 +1446,26 @@ def isblunt(self) -> bool: >>> a.isblunt() False """ - return ( - self.ovhg == 0 and len(self.watson) == len(self.crick) and not self.circular + parts = get_parts(self._data.decode("ascii")) + + return not any( + ( + parts.sticky_right5, + parts.sticky_right3, + parts.sticky_left3, + parts.sticky_left5, + self.circular, + ) ) def cas9(self, RNA: str) -> _Tuple[slice, ...]: """docstring.""" - bRNA = bytes(RNA, "ASCII") + bRNA = RNA.encode("ascii") slices = [] cuts = [0] for m in _re.finditer(bRNA, self._data): cuts.append(m.start() + 17) - cuts.append(self.length) + cuts.append(len(self)) slices = tuple(slice(x, y, 1) for x, y in zip(cuts, cuts[1:])) return slices @@ -1471,6 +1476,21 @@ def terminal_transferase(self, nucleotides="a") -> "Dseq": ovhg += len(nucleotides) return Dseq(self.watson + nucleotides, self.crick + nucleotides, ovhg) + def user(self): + """ + USER Enzyme is a mixture of Uracil DNA glycosylase (UDG) and the + DNA glycosylase-lyase Endonuclease VIII. UDG catalyses the excision + of an uracil base, forming an abasic (apyrimidinic) site while leaving + the phosphodiester backbone intact (2,3). + + Returns + ------- + None. + + """ + + return Dseq(self._data.translate(bytes.maketrans(b"UuOo", b"ZzEe"))) + def cut(self: DseqType, *enzymes: EnzymesType) -> _Tuple[DseqType, ...]: """Returns a list of linear Dseq fragments produced in the digestion. If there are no cuts, an empty list is returned. @@ -1676,6 +1696,144 @@ def right_end_position(self) -> _Tuple[int, int]: return len(self) + self.watson_ovhg(), len(self) return len(self), len(self) - self.watson_ovhg() + def get_ss_meltsites(self: DseqType, length) -> _List[CutSiteType]: + + if length < 1: + return () + + regex = ( + f"(?P((?<=[PEXIpexi]))([GATCgatcUuOo]{{1,{length}}})((?=[^QFZJqfzjGATCgatcUuOo])))|" + f"(?P((?<=[QFZJqfzj]))([GATCgatcUuOo]{{1,{length}}})((?=[^PEXIpexiGATCgatcUuOo])))" + ) + + regex = _re.compile(regex.encode("ascii")) + + if self.circular: + + spacer = length + + cutfrom = self._data[-length:] + self._data + self._data[:length] + + else: + + spacer = 0 + + cutfrom = self._data + + watsoncuts = [] + crickcuts = [] + + for m in regex.finditer(cutfrom): + + if m.lastgroup == "watson": + cut1 = m.start() + spacer + cut2 = m.end() + spacer + watsoncuts.append((cut1, cut2)) + else: + assert m.lastgroup == "crick" + cut1 = m.start() + spacer + cut2 = m.end() + spacer + crickcuts.append((cut1, cut2)) + + return watsoncuts, crickcuts + + def get_ds_meltsites(self: DseqType, length) -> _List[CutSiteType]: + """Double stranded DNA melt sites + + Returns a sorted list (by distance from the left) of tuples. Each tuple (`((cut_watson, ovhg), enz)`) + contains a tuple containing the cut position and the overhang value for the enzyme followed by the + an enzyme object. + + """ + + if length < 1: + return () + + regex = ( + f"(?P((?<=[PEXIpexi])|^)([GATCgatcUuOo]{{1,{length}}})((?=[^PEXIpexiGATCgatcUuOo])|$))|" + f"(?P((?<=[QFZJqfzj])|^)([GATCgatcUuOo]{{1,{length}}})((?=[^QFZJqfzjGATCgatcUuOo])|$))" + ) + + regex = _re.compile(regex.encode("ascii")) + + if self.circular: + + spacer = length + + cutfrom = self._data[-length:] + self._data + self._data[:length] + + else: + + spacer = 0 + + cutfrom = self._data + + cuts = [] + + for m in regex.finditer(cutfrom): + + if m.lastgroup == "watson": + cut = (m.end() - spacer, m.end() - m.start()), None + else: + assert m.lastgroup == "crick" + cut = (m.start() - spacer, m.start() - m.end()), None + + cuts.append(cut) + + return cuts + + def cast_to_ds_right(self): + """ + NNNN---- + NNNNCTAG + + NNNNGATC + NNNNCTAG + + NNNNGATC + NNNN---- + + NNNNGATC + NNNNCTAG + + """ + + p = get_parts(self._data.decode("ascii")) + + ds_stuffer = (p.sticky_right5 or p.sticky_right3).translate( + dscode_to_full_sequence_table + ) + + result = (p.sticky_left5 or p.sticky_left3) + p.middle + ds_stuffer + + return self.__class__(result, circular=False) + + def cast_to_ds_left(self): + """ + GATCNNNN + NNNN + + GATCNNNN + CTAGNNNN + + NNNN + CTAGNNNN + + GATCNNNN + CTAGNNNN + + """ + + p = get_parts(self._data.decode("ascii")) + + ds_stuffer = (p.sticky_left5 or p.sticky_left3).translate( + dscode_to_full_sequence_table + ) + + result = ds_stuffer + p.middle + (p.sticky_right5 or p.sticky_right3) + + return self.__class__(result, circular=False) + def get_cut_parameters( self, cut: _Union[CutSiteType, None], is_left: bool ) -> _Tuple[int, int, int]: @@ -1705,6 +1863,64 @@ def get_cut_parameters( # In the right end, the overhang does not matter return *self.right_end_position(), self.watson_ovhg() + def melt(self, length): + if not length or length < 1: + return tuple() + + new, strands = self.shed_ss_dna(length) + + cutsites = new.get_ds_meltsites(length) + + cutsite_pairs = self.get_cutsite_pairs(cutsites) + + result = tuple(new.apply_cut(*cutsite_pair) for cutsite_pair in cutsite_pairs) + + result = tuple([new]) if strands and not result else result + + return strands + result + + def shed_ss_dna(self, length): + + new, strands, intervals = self._shed_ss_dna(length) + + return new, strands + + def _shed_ss_dna(self, length): + + watsonnicks, cricknicks = self.get_ss_meltsites(length) + + watsonstrands = [] + crickstrands = [] + + new = self._data + + for x, y in watsonnicks: + stuffer = new[x:y] + ss = Dseq.quick(stuffer.translate(dscode_to_watson_tail_table)) + new = new[:x] + stuffer.translate(dscode_to_crick_tail_table) + new[y:] + watsonstrands.append((x, y, ss)) + + for x, y in cricknicks: + stuffer = new[x:y] + ss = Dseq.quick(stuffer.translate(dscode_to_crick_tail_table)) + new = new[:x] + stuffer.translate(dscode_to_watson_tail_table) + new[y:] + crickstrands.append((x, y, ss)) + + ordered_strands = sorted(watsonstrands + crickstrands) + + strands = [] + + for x, y, ss in ordered_strands: + seq = ( + ss._data[::-1].translate(dscode_to_watson_table).strip() + or ss._data.translate(dscode_to_crick_table).strip() + ) + strands.append(_Seq(seq)) + + intervals = tuple((x, y) for x, y, ss in ordered_strands) + + return Dseq(new), tuple(strands), intervals + def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq": """Extracts a subfragment of the sequence between two cuts. diff --git a/src/pydna/dseqrecord.py b/src/pydna/dseqrecord.py index fadd9a44..4bd195e2 100644 --- a/src/pydna/dseqrecord.py +++ b/src/pydna/dseqrecord.py @@ -18,7 +18,7 @@ from pydna.utils import flatten as _flatten, location_boundaries as _location_boundaries # from pydna.utils import memorize as _memorize -from pydna.utils import rc as _rc +# from pydna.utils import rc as _rc from pydna.utils import shift_location as _shift_location from pydna.utils import shift_feature as _shift_feature from pydna.common_sub_strings import common_sub_strings as _common_sub_strings @@ -219,9 +219,7 @@ def from_string( obj = cls.__new__(cls) # Does not call __init__ obj._per_letter_annotations = {} obj.seq = _Dseq.quick( - record, - _rc(record), - ovhg=0, + record.encode("ascii"), # linear=linear, circular=circular, ) @@ -258,9 +256,7 @@ def from_SeqRecord( obj.n = n if circular is None: circular = record.annotations.get("topology") == "circular" - obj.seq = _Dseq.quick( - str(record.seq), _rc(str(record.seq)), ovhg=0, circular=circular - ) + obj.seq = _Dseq.quick(record.seq._data, ovhg=0, circular=circular) return obj @property @@ -332,7 +328,7 @@ def add_feature( location = _CompoundLocation( ( - _SimpleLocation(x, self.seq.length, strand=strand), + _SimpleLocation(x, len(self.seq), strand=strand), _SimpleLocation(0, y, strand=strand), ) ) @@ -850,7 +846,7 @@ def __getitem__(self, sl): f for f in answer.features if ( - _location_boundaries(f.location)[1] <= answer.seq.length + _location_boundaries(f.location)[1] <= len(answer.seq) and _location_boundaries(f.location)[0] < _location_boundaries(f.location)[1] ) diff --git a/src/pydna/ligate.py b/src/pydna/ligate.py index 280bb5ba..50d1e0c0 100644 --- a/src/pydna/ligate.py +++ b/src/pydna/ligate.py @@ -31,7 +31,7 @@ def ligate(fragments: list): try: seq1 + seq2 except TypeError as err: - if str(err) != "sticky ends not compatible!": + if not str(err).startswith("sticky ends not compatible"): raise else: if seq1.seq.three_prime_end() != ( diff --git a/src/pydna/seq.py b/src/pydna/seq.py index c595aeef..2ad0f43f 100644 --- a/src/pydna/seq.py +++ b/src/pydna/seq.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -A subclass of the Biopython SeqRecord class. +A subclass of Biopython Bio.Seq.Seq Has a number of extra methods and uses the :class:`pydna._pretty_str.pretty_str` class instread of str for a @@ -34,6 +34,10 @@ class Seq(_Seq): """docstring.""" + @property + def full_sequence(self): + return self + def translate( self, *args, @@ -154,15 +158,17 @@ def seguid(self) -> str: ---------- .. [#] http://wiki.christophchamp.com/index.php/SEGUID """ - return _lsseguid(self._data.decode("utf8").upper(), alphabet="{DNA-extended}") + return _lsseguid( + self._data.decode("ascii").upper(), alphabet="{DNA-extended},AU" + ) - def __getitem__(self, key): - result = super().__getitem__(key) - try: - result.__class__ = self.__class__ - except TypeError: - pass - return result + # def __getitem__(self, key): + # result = super().__getitem__(key) + # try: + # result.__class__ = self.__class__ + # except TypeError: + # pass + # return result def reverse_complement(self): return self.__class__(_rc(self._data)) diff --git a/src/pydna/seqrecord.py b/src/pydna/seqrecord.py index 5c727744..86054ce8 100755 --- a/src/pydna/seqrecord.py +++ b/src/pydna/seqrecord.py @@ -301,7 +301,7 @@ def add_feature( if self.circular: location = _CompoundLocation( ( - _SimpleLocation(x, self.seq.length, strand=strand), + _SimpleLocation(x, len(self.seq), strand=strand), _SimpleLocation(0, y, strand=strand), ) ) diff --git a/src/pydna/user_cloning.py b/src/pydna/user_cloning.py deleted file mode 100644 index 1402c65a..00000000 --- a/src/pydna/user_cloning.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Copyright 2013-2023 by Björn Johansson. All rights reserved. -# This code is part of the Python-dna distribution and governed by its -# license. Please see the LICENSE.txt file that should have been included -# as part of this package. - - -# from abc import ABC, abstractmethod -# import re -# from pydna.utils import rc - - -""" -Nicking & USER cloning - - -CGAuGTCGACTTAGATCTCACAGGCTTTTTTCAAGaCGGCCTTGAATTCAGTCATTTGGATCCGGCCGATC -GCTACAGCTGAATCTAGAGTGTCCGAAAAAAGTTCTGCCGGAACTTAAGTCAGTAAACCTAGGCCGGCuAG - -1. Digest both strands -2. Collect all linear ssDNA -3. Anneal all combinations -4. Keep ones present in original molecule -5. Rank by stability -6. - -""" diff --git a/src/pydna/utils.py b/src/pydna/utils.py index 21a226f6..e7fe94f3 100644 --- a/src/pydna/utils.py +++ b/src/pydna/utils.py @@ -6,17 +6,7 @@ # as part of this package. """Miscellaneous functions.""" -from Bio.Data.IUPACData import ambiguous_dna_complement as _ambiguous_dna_complement -from Bio.Seq import _maketrans - -# import shelve as _shelve -# import os as _os import re as _re - -# import logging as _logging -# import base64 as _base64 -# import pickle as _pickle -# import hashlib as _hashlib import keyword as _keyword import collections as _collections import itertools as _itertools @@ -30,7 +20,8 @@ from pydna.codon import weights as _weights from pydna.codon import rare_codons as _rare_codons - +from pydna.alphabet import basepair_dict +from pydna.alphabet import complement_table_for_dscode from Bio.SeqFeature import SimpleLocation as _sl from Bio.SeqFeature import CompoundLocation as _cl from Bio.SeqFeature import Location as _Location @@ -40,10 +31,6 @@ # For functions that take str or bytes as input and return str or bytes as output, matching the input type StrOrBytes = _TypeVar("StrOrBytes", str, bytes) -# _module_logger = _logging.getLogger("pydna." + __name__) -_ambiguous_dna_complement.update({"U": "A"}) -_complement_table = _maketrans(_ambiguous_dna_complement) - def three_frame_orfs( dna: str, @@ -210,6 +197,29 @@ def smallest_rotation(s): return s[k:] + s[:k] +def anneal_from_left(watson: str, crick: str) -> int: + """ + The length of the common prefix shared by two strings. + + Args: + str1 (str): The first string. + str2 (str): The second string. + + Returns: + int: The length of the common prefix. + """ + + result = len( + list( + _itertools.takewhile( + lambda x: basepair_dict.get((x[0], x[1])), zip(watson, crick[::-1]) + ) + ) + ) + + return result + + def cai(seq: str, organism: str = "sce", weights: dict = _weights): """docstring.""" from cai2 import CAI as _CAI @@ -276,15 +286,15 @@ def rc(sequence: StrOrBytes) -> StrOrBytes: accepts mixed DNA/RNA """ - return sequence.translate(_complement_table)[::-1] + return complement(sequence)[::-1] -def complement(sequence: str): +def complement(sequence: StrOrBytes) -> StrOrBytes: """Complement. accepts mixed DNA/RNA """ - return sequence.translate(_complement_table) + return sequence.translate(complement_table_for_dscode) # def memorize(filename): diff --git a/tests/test_USERcloning.py b/tests/test_USERcloning.py index ad389a92..a5f35708 100644 --- a/tests/test_USERcloning.py +++ b/tests/test_USERcloning.py @@ -1,73 +1,66 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import pytest +from pydna.dseq import Dseq from pydna.parsers import parse, parse_primers from pydna.amplify import pcr - +from textwrap import dedent def test_USER_cloning(): + # >a + # GUGGATT + primers = """ - >3CYC1clon - CGAUGTCGACTTAGATCTCACAGGCTTTTTTCAAG + >a + GUGGATT - >5CYC1clone - GAUCGGCCGGATCCAAATGACTGAATTCAAGGCC + >b + cUCGCCG """ template = """ - >templ - CgATGTCGACTTAGATCTCACAGGCTTTTTTCAAGaCGGCCTTGAATTCAGTCATTTGGATCCGGCCGAtC + >temp + gtGGATTaaaCGGCGag """ fp, rp = parse_primers(primers) te, *rest = parse(template) te.add_feature() - p = pcr((fp, rp, te)) + p = pcr((fp, rp, te), limit=5) + assert p.seq._data == b"GUGGATTaaaCGGCGOg" figure = p.figure() - correct_figure = """\ -5CgATGTCGACTTAGATCTCACAGGCTTTTTTCAAG...GGCCTTGAATTCAGTCATTTGGATCCGGCCGAtC3 - |||||||||||||||||||||||||||||||||| - 3CCGGAACTTAAGTCAGTAAACCTAGGCCGGCUAG5 -5CGAUGTCGACTTAGATCTCACAGGCTTTTTTCAAG3 - ||||||||||||||||||||||||||||||||||| -3GcTACAGCTGAATCTAGAGTGTCCGAAAAAAGTTC...CCGGAACTTAAGTCAGTAAACCTAGGCCGGCTaG5""" - + correct_figure = dedent( + """\ + 5gtGGATT...CGGCGag3 + ||||||| + 3GCCGCUc5 + 5GUGGATT3 + ||||||| + 3caCCTAA...GCCGCtc5""" + ) assert figure == correct_figure - assert p.seq.watson == "CGAUGTCGACTTAGATCTCACAGGCTTTTTTCAAGaCGGCCTTGAATTCAGTCATTTGGATCCGGCCGATC" - assert p.seq.crick == "GAUCGGCCGGATCCAAATGACTGAATTCAAGGCCGtCTTGAAAAAAGCCTGTGAGATCTAAGTCGACATCG" - - -# hej = p.seq - -# from Bio.SeqFeature import SeqFeature -# import re + w = "GUGGATTaaaCGGCGAg" + c = "CACCTAAtttGCCGCUc" -# wpos = [0] + [m.start() for m in re.finditer('U', hej.watson)] + [len(hej.watson)] -# cpos = [0] + [m.start() for m in re.finditer('U', hej.crick)] + [len(hej.crick)] + assert p.seq.watson == w + assert p.seq.crick == c[::-1] -# from itertools import tee + assert te.features[0] == p.features[0] + USERprocessed = p.seq.user() -# def pairwise(iterable): -# "s -> (s0,s1), (s1,s2), (s2, s3), ..." -# a, b = tee(iterable) -# next(b, None) -# return list(slice(x, y, 1) for x, y in zip(a, b)) + assert USERprocessed.melt(1) == USERprocessed.melt(12) + stuffer, insert, stuffer = USERprocessed.melt(1) -# wslices = pairwise(wpos) -# cslices = pairwise(cpos) + plasmid = Dseq.from_representation(""" + Dseq(-7) + aaaGT + Tcttt + """) -# ln = len(hej) -# for ws in wslices: -# for cs in cslices: -# if ws.stop >= ln - cs.stop: -# pass -# # print(ws, cs) -# print(ws, cs) + plasmid_insert = (plasmid + insert).looped() -if __name__ == "__main__": - pytest.main([__file__, "-vv", "-s"]) + assert plasmid_insert == Dseq("AgaaaGTGGATTaaaCGGCG", circular=True) diff --git a/tests/test_module_assembly.py b/tests/test_module_assembly.py index 946bf190..b120aba8 100644 --- a/tests/test_module_assembly.py +++ b/tests/test_module_assembly.py @@ -2,14 +2,25 @@ # -*- coding: utf-8 -*- import pytest +# from importlib import reload +from pydna import assembly +from pydna.dseqrecord import Dseqrecord +from pydna.parsers import parse +from pydna.utils import eq +from Bio.SeqFeature import SeqFeature +from Bio.SeqFeature import FeatureLocation +from Bio.SeqFeature import ExactPosition +from pydna.amplify import pcr +from pydna.readers import read +from Bio.Restriction import EcoRV, ZraI +from Bio.Restriction import AjiI, AgeI +from Bio.Restriction import SalI +from pydna.assembly import Assembly +from Bio.Restriction import AatII +from pydna._pretty import pretty_str + +def test_built(): - -def test_built(monkeypatch): - monkeypatch.setenv("pydna_cached_funcs", "") - from importlib import reload - from pydna import assembly - - reload(assembly) asm = assembly.Assembly(assembly.example_fragments, limit=5) lin = asm.assemble_linear() crc = asm.assemble_circular() @@ -18,18 +29,10 @@ def test_built(monkeypatch): assert [c.seq for c in crc] == [c.seq for c in assembly.circular_results] -def test_new_assembly(monkeypatch): - monkeypatch.setenv("pydna_cached_funcs", "") - from pydna.dseqrecord import Dseqrecord - from pydna import assembly - from pydna.parsers import parse - from pydna.utils import eq - from importlib import reload - from Bio.SeqFeature import SeqFeature - from Bio.SeqFeature import FeatureLocation - from Bio.SeqFeature import ExactPosition +def test_new_assembly(): + + - reload(assembly) # 0000000000111111111222222222233333333334444444444555555555566 # 0123456780123456789012345678901234567890123456789012345678901 @@ -214,8 +217,6 @@ def test_new_assembly(monkeypatch): c3 = assembly.Assembly((a, b, c), limit=14) assert str(c3.assemble_circular()[0].seq) == "acgatgctatactggCCCCCtgtgctgtgctctaTTTTTtattctggctgtatctGGGGGT" - from pydna.parsers import parse - from pydna.utils import eq text1 = """ >A_AgTEFp_b_631 NP+geg/4Ykv2pIwEqiLylYKPYOE @@ -248,14 +249,10 @@ def test_new_assembly(monkeypatch): # [821] [713] [132] [51] [38] -def test_assembly(monkeypatch): - monkeypatch.setenv("pydna_cached_funcs", "") - from pydna import assembly - from pydna.parsers import parse - from pydna.utils import eq - from importlib import reload +def test_assembly(): + + - reload(assembly) text1 = """ >A_AgTEFp_b_631 NP+geg/4Ykv2pIwEqiLylYKPYOE @@ -482,16 +479,10 @@ def test_assembly(monkeypatch): assert candidate.seguid() == "cdseguid=-mVwekticpAYIT9C4JcXmOGFkRo" -def test_MXblaster1(monkeypatch): - monkeypatch.setenv("pydna_cached_funcs", "") - from pydna import assembly - from pydna.parsers import parse - from pydna.amplify import pcr - from pydna.readers import read - from pydna.utils import eq - from importlib import reload +def test_MXblaster1(): + + - reload(assembly) """ test MXblaster1""" @@ -517,7 +508,6 @@ def test_MXblaster1(monkeypatch): pCAPs_pSU0 = read("pCAPs-pSU0.gb") # cut the pCAPs vectors for cloning - from Bio.Restriction import EcoRV, ZraI pCAPs_ZraI = pCAPs.linearize(ZraI) pCAPs_PCR_prod = pcr(primer[492], primer[493], pCAPs) @@ -585,7 +575,6 @@ def test_MXblaster1(monkeypatch): assert pCAPs_MX4blaster1.seguid() == "cdseguid=bUl04KTp5LpAulZX3UHdejwnuIQ" - from Bio.Restriction import AjiI, AgeI AX023560 = read("AX023560.gb") @@ -639,16 +628,11 @@ def test_MXblaster1(monkeypatch): assert pCAPs_MX4blaster2.seguid() == "cdseguid=c48cBUb3wF-Sdhzh0Tlprp-0CEg" -def test_assemble_pGUP1(monkeypatch): - monkeypatch.setenv("pydna_cached_funcs", "") +def test_assemble_pGUP1(): + + - from pydna.readers import read - from pydna import assembly - from pydna.utils import eq - from pydna.amplify import pcr - from importlib import reload - reload(assembly) GUP1rec1sens = read("GUP1rec1sens.txt") GUP1rec2AS = read("GUP1rec2AS.txt") @@ -657,7 +641,6 @@ def test_assemble_pGUP1(monkeypatch): insert = pcr(GUP1rec1sens, GUP1rec2AS, GUP1_locus) - from Bio.Restriction import SalI his3, lin_vect = pGREG505.cut(SalI) @@ -676,7 +659,7 @@ def test_assemble_pGUP1(monkeypatch): assert pGUP1.seguid() == "cdseguid=QiK2pH9yioTPfSobUTLz4CPiNzY" -# def test_35_36(monkeypatch): +# def test_35_36(): # import sys # from pydna.assembly import _od # if sys.version_info < (3, 6): @@ -686,15 +669,12 @@ def test_assemble_pGUP1(monkeypatch): # assert _od==dict -def test_pYPK7_TDH3_GAL2_PGI1(monkeypatch): - from pydna.readers import read - from pydna.assembly import Assembly +def test_pYPK7_TDH3_GAL2_PGI1(): pMEC1142 = read("pYPK0_TDH3_GAL2_PGI1.gb") pYPKp7 = read("pYPKp7.gb") - from Bio.Restriction import AatII pYPKp7_AatII = pYPKp7.linearize(AatII) @@ -703,9 +683,7 @@ def test_pYPK7_TDH3_GAL2_PGI1(monkeypatch): assert z.assemble_circular()[1].seguid() == "cdseguid=DeflrptvvS6m532WogvxQSgVKpk" -def test_marker_replacement_on_plasmid(monkeypatch): - from pydna.assembly import Assembly - from pydna.parsers import parse +def test_marker_replacement_on_plasmid(): f, r, _, _ = parse( """ @@ -724,12 +702,10 @@ def test_marker_replacement_on_plasmid(monkeypatch): """ ) - from pydna.readers import read pAG32 = read("pAG32.gb") pMEC1135 = read("pMEC1135.gb") - from pydna.amplify import pcr hygromycin_product = pcr(f, r, pAG32) @@ -741,12 +717,9 @@ def test_marker_replacement_on_plasmid(monkeypatch): assert pMEC1135.features[-1].extract(pMEC1135).seq == candidate.features[-1].extract(candidate).seq -def test_linear_with_annotations2(monkeypatch): +def test_linear_with_annotations2(): # Thanks to James Bagley for finding this bug # https://github.com/JamesBagley - from pydna._pretty import pretty_str - from pydna.assembly import Assembly - from pydna.dseqrecord import Dseqrecord a = Dseqrecord("acgatgctatactgtgCCNCCtgtgctgtgctcta") a.add_feature(0, 10, label="a_feat") @@ -790,8 +763,3 @@ def test_linear_with_annotations2(monkeypatch): # tgtgctgtgctctaTTTTTTTtattctggctgtatcCCCCCC # TATTCTGGCTGTATC # GtattctggctgtatcGGGGGtacgatgctatactgtg - - -if __name__ == "__main__": - pytest.main([__file__, "-x", "-vv", "-s"]) - # pytest.main([__file__, "-x", "-vvv", "-s", "--profile"]) diff --git a/tests/test_module_assembly2.py b/tests/test_module_assembly2.py index 3b05a548..f5454594 100644 --- a/tests/test_module_assembly2.py +++ b/tests/test_module_assembly2.py @@ -1062,7 +1062,7 @@ def test_pcr_assembly_normal(): assert str(prods[0].seq) == "TTTACGTACGTAAAAAAGCGCGCGCTTT" -@pytest.mark.xfail(reason="U in primers not handled") +# @pytest.mark.xfail(reason="U in primers not handled") def test_pcr_assembly_uracil(): primer1 = Primer("AUUA") @@ -1071,7 +1071,7 @@ def test_pcr_assembly_uracil(): seq = Dseqrecord(Dseq("aaATTAggccggTTAAaa")) asm = assembly.PCRAssembly([primer1, seq, primer2], limit=4) - assert str(asm.assemble_linear()[0].seq) == "AUUAggccggTTAA" + assert str(asm.assemble_linear()[0].seq) == "AUUAggccggTTOO" # FIXME: This is the expected result, see def in utils assert asm.assemble_linear()[0].seq.crick.startswith("UUAA") primer1 = Primer("ATAUUA") @@ -1603,19 +1603,19 @@ def test_insertion_assembly(): # Insertion of linear sequence into linear sequence (like # homologous recombination of PCR product with homology arms in genome) - a = Dseqrecord("1CGTACGCACAxxxxCGTACGCACAC2") - b = Dseqrecord("3CGTACGCACAyyyyCGTACGCACAT4") + a = Dseqrecord("CGTACGCACAbbbbCGTACGCACAC") + b = Dseqrecord("CGTACGCACArrrrCGTACGCACAT") f = assembly.Assembly([a, b], use_fragment_order=False, limit=10) # All possibilities, including the single insertions results = [ - "1CGTACGCACAyyyyCGTACGCACAxxxxCGTACGCACAC2", - "1CGTACGCACAyyyyCGTACGCACAC2", - "3CGTACGCACAxxxxCGTACGCACAyyyyCGTACGCACAT4", - "1CGTACGCACAxxxxCGTACGCACAyyyyCGTACGCACAC2", - "3CGTACGCACAxxxxCGTACGCACAT4", - "3CGTACGCACAyyyyCGTACGCACAxxxxCGTACGCACAT4", + "CGTACGCACArrrrCGTACGCACAbbbbCGTACGCACAC", + "CGTACGCACArrrrCGTACGCACAC", + "CGTACGCACAbbbbCGTACGCACArrrrCGTACGCACAT", + "CGTACGCACAbbbbCGTACGCACArrrrCGTACGCACAC", + "CGTACGCACAbbbbCGTACGCACAT", + "CGTACGCACArrrrCGTACGCACAbbbbCGTACGCACAT", ] assembly_products = [ @@ -1627,21 +1627,21 @@ def test_insertion_assembly(): # TODO: debatable whether this kind of homologous recombination should happen, or how # the overlap restrictions should be applied. - a = Dseqrecord("1CGTACGCACAxxxxC2") - b = Dseqrecord("3CGTACGCACAyyyyCGTACGCACAT4") + a = Dseqrecord("CGTACGCACAbbbbC") + b = Dseqrecord("CGTACGCACArrrrCGTACGCACAT") f = assembly.Assembly([a, b], use_fragment_order=False, limit=10) - results = ["1CGTACGCACAyyyyCGTACGCACAxxxxC2"] + results = ["CGTACGCACArrrrCGTACGCACAbbbbC"] for assem, result in zip(f.get_insertion_assemblies(), results): assert result == str(assembly.assemble([a, b], assem).seq) - a = Dseqrecord("1CGTACGCACAxxxxC2") - b = Dseqrecord("3CGTACGCACAyyyyT4") + a = Dseqrecord("CGTACGCACAbbbbC") + b = Dseqrecord("CGTACGCACArrrrT") f = assembly.Assembly([a, b], use_fragment_order=False, limit=10) assert len(f.get_insertion_assemblies()) == 0 # Does not work for circular molecules - a = Dseqrecord("1CGTACGCACAxxxxCGTACGCACAC2", circular=True) - b = Dseqrecord("3CGTACGCACAyyyyCGTACGCACAT4", circular=True) + a = Dseqrecord("CGTACGCACAbbbbCGTACGCACAC", circular=True) + b = Dseqrecord("CGTACGCACArrrrCGTACGCACAT", circular=True) assert ( assembly.Assembly( [a, b], use_fragment_order=False, limit=10 @@ -1649,8 +1649,8 @@ def test_insertion_assembly(): == [] ) - a = Dseqrecord("1CGTACGCACAxxxxC2", circular=True) - b = Dseqrecord("3CGTACGCACAyyyyCGTACGCACAT4", circular=True) + a = Dseqrecord("CGTACGCACAbbbbC", circular=True) + b = Dseqrecord("CGTACGCACArrrrCGTACGCACAT", circular=True) assert ( assembly.Assembly( [a, b], use_fragment_order=False, limit=10 @@ -1658,8 +1658,8 @@ def test_insertion_assembly(): == [] ) - a = Dseqrecord("1CGTACGCACAxxxxC2", circular=True) - b = Dseqrecord("3CGTACGCACAyyyyT4", circular=True) + a = Dseqrecord("CGTACGCACAbbbbC", circular=True) + b = Dseqrecord("CGTACGCACArrrrT", circular=True) assert ( assembly.Assembly( [a, b], use_fragment_order=False, limit=10 @@ -1752,6 +1752,7 @@ def test_assemble_function(): f2.features = [f2_feat1, f2_feat2] for shift in range(len(f1)): + f1_shifted = f1.shifted(shift) # Re-order the features so that TTT is first @@ -1839,6 +1840,7 @@ def test_assemble_function(): assembly_plan = [ (1, 2, loc_end, loc_start), ] + # FIXME: The assert below fails in the Sanity check on line 770 in assembly2, but gives the expected result. assert (fragments[0] + fragments[1]).seq == assembly.assemble( fragments, assembly_plan ).seq @@ -1848,6 +1850,7 @@ def test_assemble_function(): (1, 2, loc_end, loc_start), (2, 1, loc_end, loc_start), ] + # FIXME: The assert below fails in the Sanity check on line 770 in assembly2, but gives the expected result. assert (fragments[0] + fragments[1]).looped().seq == assembly.assemble( fragments, assembly_plan ).seq @@ -2125,7 +2128,7 @@ def test_ligation_assembly(): # Blunt ligation combined with sticky end fragments = Dseqrecord("AAAGAATTCAAA").cut(EcoRI) - result = assembly.ligation_assembly(fragments, allow_blunt=True) + result = assembly.ligation_assembly(fragments, allow_blunt=True) # FIXME: The assert below fails in the Sanity check on line 770 in assembly2, but gives the expected result. result_str = [str(x.seq) for x in result] assert sorted(result_str) == sorted(["AAAGAATTCAAA"]) assert result[0].circular @@ -2147,7 +2150,7 @@ def test_blunt_assembly(): use_fragment_order=False, ) - assert dseqrecord_list_to_dseq_list(asm.assemble_linear()) == [(b + a).seq] + assert dseqrecord_list_to_dseq_list(asm.assemble_linear()) == [(b + a).seq] # FIXME: The assert below fails in the Sanity check on line 770 in assembly2, but gives the expected result. assert asm.assemble_circular() == [] # Circular assembly @@ -2448,7 +2451,7 @@ def test_insertion_edge_case(): def test_common_sub_strings(): - a = Dseqrecord("012345", circular=True) + a = Dseqrecord("RYBDKM", circular=True) for shift_1 in range(len(a)): a_shifted = a.shifted(shift_1) for shift_2 in range(len(a)): diff --git a/tests/test_module_design.py b/tests/test_module_design.py index 704fcf9e..1857de17 100755 --- a/tests/test_module_design.py +++ b/tests/test_module_design.py @@ -368,5 +368,5 @@ def test_primer_design_correct_value(): assert str(rvs.seq) == str(correct_rvs.seq) -if __name__ == "__main__": - pytest.cmdline.main([__file__, "-v", "-s"]) +# if __name__ == "__main__": +# pytest.cmdline.main([__file__, "-v", "-s"]) diff --git a/tests/test_module_dseq.py b/tests/test_module_dseq.py index 770fe9bf..d1e40dd9 100644 --- a/tests/test_module_dseq.py +++ b/tests/test_module_dseq.py @@ -2,10 +2,21 @@ # -*- coding: utf-8 -*- import pytest +import textwrap +from pydna.dseq import Dseq +from pydna.utils import eq + +from Bio.Restriction import ( + Acc65I, ApaI, BamHI, BglII, BsaI, Bsp120I, EcoRI, EcoRV, + KpnI, MaeII, NmeDI, NotI, PacI, PstI, RestrictionBatch, TaiI +) + +from seguid import ldseguid +from seguid import cdseguid def test_cut1(): - from pydna.dseq import Dseq + """ Acc65I.search(_Seq("GGTACC")) @@ -21,7 +32,7 @@ def test_cut1(): Out [12]: [6] """ - from Bio.Restriction import Acc65I, Bsp120I, KpnI, ApaI, TaiI, MaeII + """ @@ -76,7 +87,7 @@ def test_cut1(): def test_cas9(): - from pydna.dseq import Dseq + s = Dseq("gattcatgcatgtagcttacgtagtct") @@ -86,8 +97,8 @@ def test_cas9(): def test_initialization(): - import pytest - from pydna.dseq import Dseq + + obj = Dseq(b"aaa") assert obj.crick == "ttt" @@ -156,9 +167,6 @@ def test_initialization(): def test_cut_around_and_religate(): - from pydna.dseq import Dseq - from pydna.utils import eq - from Bio.Restriction import KpnI, BamHI, Acc65I def cut_and_religate_Dseq(seq_string, enz, top): ds = Dseq(seq_string, circular=not top) @@ -190,17 +198,13 @@ def cut_and_religate_Dseq(seq_string, enz, top): ("aaaGGTACCcccGGATCCCgggGGTACCttt", [BamHI, KpnI], True), ] - for s in seqs: - print(s) - sek, enz, lin = s + for sek, enz, lin in seqs: for i in range(len(sek)): zek = sek[i:] + sek[:i] cut_and_religate_Dseq(zek, enz, lin) def test_Dseq_cutting_adding(): - from pydna.dseq import Dseq - from Bio.Restriction import BamHI, PstI, EcoRI a = Dseq( "GGATCCtcatctactatcatcgtagcgtactgatctattctgctgctcatcatcggtactctctataattatatatatatgcgcgtGGATCC", @@ -236,8 +240,6 @@ def test_Dseq_cutting_adding(): def test_dseq(): - import textwrap - from pydna.dseq import Dseq obj1 = Dseq("a", "t", circular=True) obj2 = Dseq("a", "t") @@ -251,8 +253,12 @@ def test_dseq(): with pytest.raises(TypeError): obj1 + "" - with pytest.raises(AttributeError): - obj2 + "" + + # with pytest.raises(AttributeError): + # obj2 + "" + + assert obj2 + "" == obj2 + assert "" + obj2 == obj2 obj1 = Dseq("at", "t") obj2 = Dseq("a", "t") @@ -284,20 +290,28 @@ def test_dseq(): assert obj[2:1]._data == b"ga" obj = Dseq("G", "", 0) - assert obj.five_prime_end() == ("5'", "g") + assert obj.five_prime_end() == ("single", "g") + assert obj.three_prime_end() == ("single", "g") obj = Dseq("", "C", 0) - assert obj.five_prime_end() == ("3'", "c") + assert obj.five_prime_end() == ("single", "c") + assert obj.three_prime_end() == ("single", "c") + + + obj = Dseq("ccGGATCC", "aaggatcc", -2) - assert obj._data == b"ccGGATCCtt" + # assert obj._data == b"ccGGATCCtt" + assert obj._data == b"iiGGATCCzz" assert str(obj.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-10) ccGGATCC - cctaggaa + CCTAGGaa """ ).strip() + + assert repr(obj) == rpr assert obj[3] == Dseq("G", "c", 0) @@ -343,53 +357,54 @@ def test_dseq(): assert Dseq("GGATCC", "GGATCC").t4("gat") == Dseq("ggat", "ggat", ovhg=-2) a2 = Dseq("ccGGATCCaa", "ggatcc", -2) - assert a2._data == b"ccGGATCCaa" - assert a2._data == b"ccGGATCCaa" + # assert a2._data == b"ccGGATCCaa" + assert a2._data == b"iiGGATCCee" assert str(a2.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-10) ccGGATCCaa - cctagg + CCTAGG """ ).strip() + assert repr(a2) == rpr a3 = Dseq("ccGGATCC", "ggatcc", -2) - assert a3._data == b"ccGGATCC" - assert a3._data == b"ccGGATCC" + # assert a3._data == b"ccGGATCC" + assert a3._data == b"iiGGATCC" assert str(a3.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-8) ccGGATCC - cctagg + CCTAGG """ ).strip() assert repr(a3) == rpr b = Dseq("GGATCC", "aaggatcccc", 2) - assert b._data == b"ggGGATCCtt" - assert b._data == b"ggGGATCCtt" + # assert b._data == b"ggGGATCCtt" + assert b._data == b"qqGGATCCzz" assert str(b.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-10) GGATCC - cccctaggaa + ccCCTAGGaa """ ).strip() assert repr(b) == rpr b2 = Dseq("GGATCCaa", "ggatcccc", 2) - assert b2._data == b"ggGGATCCaa" - assert b2._data == b"ggGGATCCaa" + # assert b2._data == b"ggGGATCCaa" + assert b2._data == b"qqGGATCCee" assert str(b2.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-10) GGATCCaa - cccctagg + ccCCTAGG """ ).strip() assert repr(b2) == rpr @@ -397,46 +412,46 @@ def test_dseq(): assert b2.rc().seguid() == "ldseguid=F0z-LxHZqAK3HvqQiqjM7A28daE" b3 = Dseq("GGATCC", "ggatcccc", 2) - assert b3._data == b"ggGGATCC" - assert b3._data == b"ggGGATCC" + # assert b3._data == b"ggGGATCC" + assert b3._data == b"qqGGATCC" assert str(b3.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-8) GGATCC - cccctagg + ccCCTAGG """ ).strip() assert repr(b3) == rpr c = Dseq("GGATCCaaa", "ggatcc", 0) - assert c._data == b"GGATCCaaa" - assert c._data == b"GGATCCaaa" + # assert c._data == b"GGATCCaaa" + assert c._data == b"GGATCCeee" assert str(c.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-9) GGATCCaaa - cctagg + CCTAGG """ ).strip() assert repr(c) == rpr d = Dseq("GGATCC", "aaaggatcc", 0) - assert d._data == b"GGATCCttt" - assert d._data == b"GGATCCttt" + # assert d._data == b"GGATCCttt" + assert d._data == b"GGATCCzzz" assert str(d.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-9) GGATCC - cctaggaaa + CCTAGGaaa """ ).strip() assert repr(d) == rpr obj = Dseq("GGATCCaaa", "ggatcc", 0) - from Bio.Restriction import BamHI + frag1 = Dseq("G", "gatcc", 0) frag2 = Dseq("GATCCaaa", "g", -4) @@ -470,11 +485,11 @@ def test_dseq(): obj = Dseq("tagcgtagctgtagtatgtgatctggtctaa", "CCCttagaccagatcacatactacagctacgcta") - assert repr(obj) == "Dseq(-34)\ntagc..ctaa \natcg..gattCCC" + assert repr(obj) == "Dseq(-34)\ntagc..ctaa\natcg..gattCCC" obj = Dseq("tagcgtagctgtagtatgtgatctggtctaaCCC", "ttagaccagatcacatactacagctacgcta") - assert repr(obj) == "Dseq(-34)\ntagc..ctaaCCC\natcg..gatt " + assert repr(obj) == "Dseq(-34)\ntagc..ctaaCCC\natcg..gatt" obj = Dseq("agcgtagctgtagtatgtgatctggtctaa", "ttagaccagatcacatactacagctacgcta") assert repr(obj) == "Dseq(-31)\n agcg..ctaa\natcgc..gatt" @@ -512,7 +527,7 @@ def test_dseq(): assert Dseq("tagcgtagctgtagtatgtgatctggtcta", "tagaccagatcacatactacagctacgcta").looped() == obj1 - from Bio.Restriction import BglII, BamHI + obj = Dseq("ggatcc") @@ -529,7 +544,7 @@ def test_dseq(): assert BamHI in obj.n_cutters(1) assert BamHI in obj.cutters() - from Bio.Restriction import RestrictionBatch + rb = RestrictionBatch((BamHI, BglII)) @@ -553,8 +568,6 @@ def test_dseq(): def test_Dseq_slicing(): - from pydna.dseq import Dseq - from Bio.Restriction import BamHI a = Dseq("ggatcc", "ggatcc", 0) @@ -569,8 +582,8 @@ def test_Dseq_slicing(): def test_Dseq_slicing2(): - from pydna.dseq import Dseq - from Bio.Restriction import BamHI, EcoRI, KpnI + + a = Dseq("aaGGATCCnnnnnnnnnGAATTCccc", circular=True) # TODO: address this test change Related to https://github.com/pydna-group/pydna/issues/78 @@ -588,7 +601,6 @@ def test_Dseq_slicing2(): def test_Dseq___getitem__(): # test the slicing - from pydna.dseq import Dseq s = Dseq("GGATCC", circular=False) assert s[1:-1] == Dseq("GATC", circular=False) @@ -603,6 +615,8 @@ def test_Dseq___getitem__(): assert s[9:1] == Dseq("") assert t[9:1] == Dseq("") + s[-1:9] + # Indexing of full circular molecule (https://github.com/pydna-group/pydna/issues/161) s = Dseq("GGATCC", circular=True) str_seq = str(s) @@ -611,8 +625,6 @@ def test_Dseq___getitem__(): def test_cut_circular(): - from pydna.dseq import Dseq - from Bio.Restriction import BsaI, KpnI, Acc65I, NotI test = "aaaaaaGGTACCggtctcaaaa" @@ -637,35 +649,35 @@ def test_cut_circular(): def test_repr(): - from pydna.dseq import Dseq + a = Dseq("gattcgtatgctgatcgtacgtactgaaaac") assert repr(a) == "Dseq(-31)\ngatt..aaac\nctaa..tttg" - b = Dseq("gattcgtatgctgatcgtacgtactgaaaac", "gactagcatgcatgacttttc"[::-1]) + b = Dseq("gattcgtatgctgatcgtacgtactgaaaac", "gactagcatgcatgacttttg"[::-1]) - assert repr(b) == "Dseq(-31)\ngattcgtatgctga..aaac\n gact..tttc" + assert repr(b) == "Dseq(-31)\ngattcgtatgctga..aaac\n gact..tttg" - c = Dseq("gattcgtatgctgatcgtacgtactgaaaac", "actagcatgcatgacttttc"[::-1]) + c = Dseq("gattcgtatgctgatcgtacgtactgaaaac", "actagcatgcatgacttttg"[::-1]) - assert repr(c) == "Dseq(-31)\ngatt..atgctgat..aaac\n acta..tttc" + assert repr(c) == "Dseq(-31)\ngatt..atgctgat..aaac\n acta..tttg" d = Dseq("gattcgtatgctgatcgtacg", "gactagcatgc"[::-1]) assert repr(d) == "Dseq(-21)\ngattcgtatgctgatcgtacg\n gactagcatgc" - e = Dseq("gactagcatgcatgacttttc", "gattcgtatgctgatcgtacgtactgaaaac"[::-1]) + e = Dseq("gactagcatgcatgacttttc", "gattcgtatgctgatcgtacgtactgaaaag"[::-1]) - assert repr(e) == "Dseq(-31)\n gact..tttc\ngattcgtatgctga..aaac" + assert repr(e) == "Dseq(-31)\n gact..tttc\ngattcgtatgctga..aaag" - f = Dseq("Ggactagcatgcatgacttttc", "gattcgtatgctgatcgtacgtactgaaaac"[::-1]) + f = Dseq("Cgactagcatgcatgacttttc", "gattcgtatgctgatcgtacgtactgaaaag"[::-1]) - assert repr(f) == "Dseq(-31)\n Ggac..tttc\ngattcgtatgctg..aaac" + assert repr(f) == "Dseq(-31)\n Cgac..tttc\ngattcgtatGctg..aaag" g = Dseq("gattcgtatgctgatcgtacgtactgaaaac", "ctaagcatacgactagc"[::-1]) - assert repr(g) == "Dseq(-31)\ngatt..atcgtacg..aaac\nctaa..tagc " + assert repr(g) == "Dseq(-31)\ngatt..atcgtacg..aaac\nctaa..tagc" h = Dseq("cgtatgctgatcgtacgtactgaaaac", "gcatacgactagc"[::-1]) @@ -673,15 +685,15 @@ def test_repr(): i = Dseq("cgtatgctgatcgtacgtactgaaaacagact", "gcatacgactagc"[::-1]) - assert repr(i) == "Dseq(-32)\ncgta..atcgtacg..gact\ngcat..tagc " + assert repr(i) == "Dseq(-32)\ncgta..atcgtacg..gact\ngcat..tagc" - j = Dseq("gattcgtatgctgatcgtacgtactgaaaac", "acAAGGAGAGAtg", ovhg=11) + j = Dseq("gtttcgtatgctgatcgtacgtactgaaaac", "acAAGGAGAGAtg", ovhg=11) - assert repr(j) == "Dseq(-42)\n gattcg..aaac\ngtAG..GGAAca " + assert repr(j) == "Dseq(-42)\n gtttcg..aaac\ngtAG..GGAAca" k = Dseq("g", "gattcgtatgctgatcgtacgtactgaaaac", ovhg=0) - assert repr(k) == "Dseq(-31)\ng \ncaaaa..ttag" + assert repr(k) == "Dseq(-31)\ng\ncaaaa..ttag" x = Dseq("gattcgtatgctgatcgtacgtactgaaaa") @@ -697,7 +709,7 @@ def test_repr(): def test_shifted(): - from pydna.dseq import Dseq + a = Dseq("gatc", circular=True) @@ -719,7 +731,7 @@ def test_looped(): # Looping a circular sequence should return a copy of the sequence # not the same sequence - from pydna.dseq import Dseq + a = Dseq("gatc", circular=True) @@ -728,11 +740,11 @@ def test_looped(): def test_misc(): - from pydna.dseq import Dseq + x = Dseq("ctcgGCGGCCGCcagcggccg", circular=True) - from Bio.Restriction import NotI + a, b = x.cut(NotI) @@ -742,11 +754,11 @@ def test_misc(): def test_cut_missing_enzyme(): - from pydna.dseq import Dseq + x = Dseq("ctcgGCGGCCGCcagcggccg") - from Bio.Restriction import PstI + assert x.cut(PstI) == () @@ -756,7 +768,7 @@ def test_cut_missing_enzyme(): def test_cut_with_no_enzymes(): - from pydna.dseq import Dseq + x = Dseq("ctcgGCGGCCGCcagcggccg") @@ -768,7 +780,7 @@ def test_cut_with_no_enzymes(): def test_transcribe(): - from pydna.dseq import Dseq + x = Dseq("ATGAAATAA") @@ -778,7 +790,7 @@ def test_transcribe(): def test_translate(): - from pydna.dseq import Dseq + x = Dseq("ATGAAATAA") @@ -788,7 +800,7 @@ def test_translate(): def test_from_full_sequence_and_overhangs(): - from pydna.dseq import Dseq + test_cases = [ (2, 2, "AAAA", "TTTT"), @@ -807,7 +819,7 @@ def test_from_full_sequence_and_overhangs(): def test_right_end_position(): - from pydna.dseq import Dseq + test_cases = [ ("AAA", "TT", (3, 2)), @@ -821,12 +833,12 @@ def test_right_end_position(): def test_left_end_position(): - from pydna.dseq import Dseq + test_cases = [ ("AAA", "TT", (0, 1), -1), ("AA", "TTT", (1, 0), 1), - ("AAT", "TTT", (0, 0), 0), + ("AAA", "TTT", (0, 0), 0), ] for watson, crick, expected, ovhg in test_cases: dseq = Dseq(watson, crick, ovhg=ovhg, circular=False) @@ -834,8 +846,8 @@ def test_left_end_position(): def test_apply_cut(): - from pydna.dseq import Dseq - from Bio.Restriction import EcoRI, BamHI + + seq = Dseq("aaGAATTCaa", circular=False) @@ -917,33 +929,32 @@ def test_apply_cut(): # Rotating the sequence, apply the same cut seq = Dseq("acgtATGaatt", circular=True) for shift in range(len(seq)): + seq_shifted = seq.shifted(shift) + start = 4 - shift if start < 0: start += len(seq) # Cut with negative ovhg new_cut = ((start, -3), None) out = seq_shifted.apply_cut(new_cut, new_cut) - assert str(out) == "ATGaattacgtATG" + assert out.to_blunt_string() == "ATGaattacgtATG" # Cut with positive ovhg start = (start + 3) % len(seq) new_cut = ((start, 3), None) out = seq_shifted.apply_cut(new_cut, new_cut) - assert str(out) == "ATGaattacgtATG" + assert out.to_blunt_string() == "ATGaattacgtATG" # A blunt cut start = 4 - shift new_cut = ((start, 0), None) out = seq_shifted.apply_cut(new_cut, new_cut) - assert str(out) == "ATGaattacgt" + assert out.to_blunt_string() == "ATGaattacgt" def test_cutsite_is_valid(): - from pydna.dseq import Dseq - from Bio.Restriction import EcoRI, PacI, NmeDI, EcoRV - # Works for circular case seqs = ["GAATTC", "TTAATTAAC", "GATATC"] enzs = [EcoRI, PacI, EcoRV] @@ -1007,7 +1018,7 @@ def test_cutsite_is_valid(): def test_get_cutsite_pairs(): - from pydna.dseq import Dseq + # in the test, we replace cuts by integers for clarity. @@ -1036,7 +1047,7 @@ def test_get_cutsite_pairs(): def test_get_cut_parameters(): - from pydna.dseq import Dseq + dseq = Dseq.from_full_sequence_and_overhangs("aaaACGTaaa", 3, 3) assert dseq.get_cut_parameters(None, True) == (*dseq.left_end_position(), dseq.ovhg) @@ -1079,8 +1090,8 @@ def test_get_cut_parameters(): def test_checksums(): - from seguid import ldseguid, cdseguid - from pydna.dseq import Dseq + + # AT # TA @@ -1121,5 +1132,47 @@ def test_checksums(): assert cdseguid("AACGT", "ACGTT") == truth == Dseq("AACGT", "ACGTT", circular=True).seguid() -if __name__ == "__main__": - pytest.main([__file__, "-vv", "-s"]) +def test_ovhg(): + # No overhang + assert Dseq("AAAA").ovhg == 0 + assert Dseq("AAAA", circular=True).ovhg == 0 + # Sticky ends + assert Dseq("FFAA").ovhg == 2 + assert Dseq("EEAA").ovhg == -2 + + # Sticky end on the other hang does not matter + assert Dseq("AAFF").ovhg == 0 + assert Dseq("AAEE").ovhg == 0 + + # + assert Dseq("FFAAFF").ovhg == 2 + assert Dseq("FFAAEE").ovhg == 2 + assert Dseq("EEAAEE").ovhg == -2 + assert Dseq("EEAAFF").ovhg == -2 + + # Single strand + assert Dseq("EEEE").ovhg is None + assert Dseq("FFFF").ovhg is None + + +def test_watson_ovhg(): + # No overhang + for seq in [ + "AAAA", + "AAAA", + "FFAA", + "EEAA", + "AAFF", + "AAEE", + "FFAAFF", + "FFAAEE", + "EEAAEE", + "EEAAFF", + ]: + assert ( + Dseq(seq).watson_ovhg() == Dseq(seq).reverse_complement().ovhg + ), f"error for {seq}" + + # Single strand + assert Dseq("EEEE").watson_ovhg() is None + assert Dseq("FFFF").watson_ovhg() is None diff --git a/tests/test_module_dseqrecord.py b/tests/test_module_dseqrecord.py index b3676650..cf31e1a3 100644 --- a/tests/test_module_dseqrecord.py +++ b/tests/test_module_dseqrecord.py @@ -2,10 +2,38 @@ # -*- coding: utf-8 -*- import pytest +import IPython +import sys +import copy +import warnings +import glob + +from importlib import reload +from unittest.mock import patch, mock_open, MagicMock + +from pydna import dseqrecord +from pydna.dseq import Dseq +from pydna.dseqrecord import Dseqrecord +from pydna.readers import read +from pydna.utils import eq +from pydna.utils import location_boundaries as _location_boundaries +from pydna.amplify import pcr + +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.SeqIO import read as abiread +from Bio.SeqFeature import SeqFeature, FeatureLocation, SimpleLocation +from Bio.Restriction import ( + Acc65I, ApaI, BamHI, BglII, BsaI, Bsp120I, + Bsu36I, BstAPI, EcoRI, EcoRV, KpnI, MaeII, + NlaIV, NdeI, NotI, PacI, PstI, SmaI, + RestrictionBatch +) + def test_orfs(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("AatgaaataaT") @@ -22,7 +50,7 @@ def test_orfs(): def test_cas9(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("gattcatgcatgtagcttacgtagtct") @@ -36,9 +64,9 @@ def test_cas9(): def test_FadiBakoura(): - from Bio.SeqFeature import SeqFeature, FeatureLocation - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord + + + dseq_record = Dseqrecord(Dseq("ACTCTTTCTCTCTCT", circular=True)) dseq_record.features = [SeqFeature(FeatureLocation(start=2, end=4))] @@ -47,41 +75,41 @@ def test_FadiBakoura(): def test_IPython_missing(monkeypatch): - import IPython - from pydna import dseqrecord + + # assert dseqrecord._display is IPython.display.display assert dseqrecord._display_html == IPython.display.display_html - import sys - import copy + + fakesysmodules = copy.copy(sys.modules) fakesysmodules["IPython.display"] = None monkeypatch.delitem(sys.modules, "IPython.display") monkeypatch.setattr("sys.modules", fakesysmodules) - from importlib import reload + reload(dseqrecord) - from pydna import dseqrecord + assert dseqrecord._display_html("item") == "item" # assert dseqrecord._HTML("item") == "item" def test_initialization(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from pydna.readers import read - from Bio.Seq import Seq - from Bio.SeqRecord import SeqRecord as Srec + + + + + a = [] a.append(Dseqrecord("attt", id="1")) a.append(Dseqrecord(Dseq("attt"), id="2")) a.append(Dseqrecord(Seq("attt"), id="3")) - a.append(Dseqrecord(Srec(Seq("attt"), id="4"))) + a.append(Dseqrecord(SeqRecord(Seq("attt"), id="4"))) a.append(Dseqrecord(Dseqrecord("attt"), id="5")) a.append(Dseqrecord(Dseqrecord(Dseq("attt", circular=True)), id="6", circular=False)) @@ -98,7 +126,7 @@ def test_initialization(): a.append(Dseqrecord("attt", circular=True)) a.append(Dseqrecord(Dseq("attt"), circular=True)) a.append(Dseqrecord(Seq("attt"), circular=True)) - a.append(Dseqrecord(Srec(Seq("attt")), circular=True)) + a.append(Dseqrecord(SeqRecord(Seq("attt")), circular=True)) a.append(Dseqrecord(Dseqrecord("attt"), circular=True)) for b in a: @@ -144,6 +172,8 @@ def test_initialization(): assert dsr.circular is False assert dsr.seq.circular is False assert str(dsr.seq) == "attta" + assert dsr.seq._data == b'etttf' + assert str(dsr.seq.to_blunt_string()) == "attta" dsr = Dseqrecord(ds, circular=True) @@ -238,7 +268,7 @@ def test_initialization(): def test_linear_circular(): - from pydna.dseqrecord import Dseqrecord + """ test Dseqrecord linear & circular property""" a = Dseqrecord("attt") @@ -268,7 +298,7 @@ def test_linear_circular(): def test_stamp(): - from pydna.dseqrecord import Dseqrecord + lin = Dseqrecord("attt") lin.stamp() @@ -285,7 +315,7 @@ def test_stamp(): assert first[:42] == crc.annotations["comment"][:42] assert len(first) == len(crc.annotations["comment"]) - from pydna.dseq import Dseq + blunt = Dseqrecord(Dseq("aa")) @@ -302,7 +332,7 @@ def test_stamp(): def test_revcomp(): - from pydna.dseqrecord import Dseqrecord + # ---- # attcccgggg @@ -341,14 +371,14 @@ def test_revcomp(): def test_m(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("A" * 5000) assert f"{s.m():.3e}" == "1.544e-07" def test_extract_feature(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("tttGGATCCaaa") s.add_feature(3, 9) @@ -356,7 +386,7 @@ def test_extract_feature(): def test_find(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("tttGGATCCaaa") assert s.find(Dseqrecord("ggatcc")) == 3 @@ -366,7 +396,7 @@ def test_find(): def test_find_aa(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("tttGGATCCaaa") assert s.find_aa("FGSK") == slice(0, 12, None) @@ -374,7 +404,7 @@ def test_find_aa(): def test_str(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("tttGGATCCaaa") s.annotations = {"date": "03-JAN-2018"} @@ -390,14 +420,14 @@ def test_str(): def test___contains__(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("tttGGATCCaaa") assert "ggatcc" in s def test_seguid(): - from pydna.dseqrecord import Dseqrecord + l = Dseqrecord("tttGGATCCaaa") assert l.seguid() == "ldseguid=jbGRr-Jhpl0tVyt0Bx5nmY9_G6E" @@ -406,7 +436,7 @@ def test_seguid(): def test_format(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC", circular=True) s.format("gb") @@ -431,9 +461,9 @@ def test_format(): def test_write(): - from unittest.mock import patch - from unittest.mock import mock_open - from pydna.dseqrecord import Dseqrecord + + + s = Dseqrecord("GGATCC", circular=True) m = mock_open() @@ -457,9 +487,9 @@ def test_write(): def test_write_same_seq_to_existing_file(monkeypatch): - from unittest.mock import patch - from unittest.mock import mock_open - from pydna.dseqrecord import Dseqrecord + + + s = Dseqrecord("Ggatcc", circular=True) @@ -471,9 +501,9 @@ def test_write_same_seq_to_existing_file(monkeypatch): def test_write_different_file_to_existing_file(monkeypatch): - from unittest.mock import patch - from unittest.mock import mock_open - from pydna.dseqrecord import Dseqrecord + + + s = Dseqrecord("Ggatcc", circular=True) d = Dseqrecord("GgatcA", circular=True) @@ -487,9 +517,9 @@ def test_write_different_file_to_existing_file(monkeypatch): def test_write_different_file_to_stamped_existing_file(monkeypatch): - from unittest.mock import patch - from unittest.mock import mock_open - from pydna.dseqrecord import Dseqrecord + + + new = Dseqrecord("Ggatcc", circular=True) new.stamp() @@ -529,9 +559,9 @@ def test_write_different_file_to_stamped_existing_file(monkeypatch): def test_write_different_file_to_stamped_existing_file2(monkeypatch): - from unittest.mock import patch - from unittest.mock import mock_open - from pydna.dseqrecord import Dseqrecord + + + new = Dseqrecord("Ggatcc", circular=True) new.stamp() @@ -582,10 +612,10 @@ def test_write_different_file_to_stamped_existing_file2(monkeypatch): # monkeypatch.setitem(readers.__builtins__, 'open', open) # def test_write_to_existing_file(): -# from unittest.mock import patch -# from unittest.mock import mock_open -# from pydna.dseqrecord import Dseqrecord -# from pydna.readers import read +# +# +# +# # # s = Dseqrecord("GGATCC", circular=True) # m = mock_open() @@ -610,10 +640,10 @@ def test_write_different_file_to_stamped_existing_file2(monkeypatch): def test_cut_args(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC") - from Bio.Restriction import BamHI, BglII, RestrictionBatch + rb = RestrictionBatch((BamHI, BglII)) assert s.cut(BamHI)[0].seq == s.cut(BamHI + BglII)[0].seq == s.cut(rb)[0].seq @@ -621,8 +651,8 @@ def test_cut_args(): def test_cut_circular(): - from pydna.dseqrecord import Dseqrecord - from Bio.Restriction import BsaI, KpnI, Acc65I, NotI + + test = "aaaaaaGGTACCggtctcaaaa" @@ -646,10 +676,10 @@ def test_cut_circular(): def test_cut_add(): - from pydna.dseqrecord import Dseqrecord - from pydna.readers import read - from pydna.utils import eq - from Bio.Restriction import BamHI, EcoRI, PstI, SmaI + + + + a = Dseqrecord("GGATCCtcatctactatcatcgtagcgtactgatctattctgctgctcatcatcggtactctctataattatatatatatgcgcgtGGATCC").seq b = a.cut(BamHI)[1] @@ -746,7 +776,7 @@ def test_cut_add(): assert eq(pUC19_EcoRI_PstI_d, read("pUC19-EcoRI_PstI-d-rc.gb")) assert eq(pUC19_EcoRI_PstI_d.rc(), read("pUC19-EcoRI_PstI-d-rc.gb")) - from Bio.Restriction import Bsu36I, BstAPI + pCAPs = read("pCAPs.gb") a, b = pCAPs.cut(Bsu36I, BstAPI) @@ -755,8 +785,8 @@ def test_cut_add(): def test_Dseqrecord_cutting_adding_2(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord + + a = ( Dseqrecord( @@ -776,7 +806,7 @@ def test_Dseqrecord_cutting_adding_2(): ), ) - from Bio.Restriction import KpnI, Acc65I, NlaIV + enzymes = [Acc65I, NlaIV, KpnI] @@ -791,8 +821,8 @@ def test_Dseqrecord_cutting_adding_2(): def test_Dseqrecord_cutting_adding_3(): - from pydna.readers import read - from Bio.Restriction import Acc65I + + a = read( """ @@ -895,8 +925,8 @@ def test_Dseqrecord_cutting_adding_3(): def test_Dseqrecord_cutting_adding_4(): - from pydna.readers import read - from Bio.Restriction import KpnI, Acc65I, NlaIV, EcoRI, EcoRV + + a = read( """ @@ -1054,10 +1084,10 @@ def test_Dseqrecord_cutting_adding_4(): def test_features_on_slice(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from Bio.SeqFeature import SeqFeature - from Bio.SeqFeature import SimpleLocation + + + + dseq_record = Dseqrecord(Dseq("ACTCTTTCTCTCTCT", circular=True)) dseq_record.features = [SeqFeature(SimpleLocation(start=2, end=4))] @@ -1072,10 +1102,10 @@ def test_features_on_slice(): def test_features_change_ori(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from pydna.readers import read - from pydna.utils import eq + + + + # Shifted a sequence by zero returns a copy s = Dseqrecord("GGATCC", circular=True) @@ -1141,10 +1171,10 @@ def test_features_change_ori(): # for x in b.features[0].location.parts: # print(x.extract(b.seq)) - # from pydna.utils import shift_location - # from Bio.SeqFeature import SimpleLocation - # from Bio.SeqFeature import CompoundLocation - # from Bio.SeqFeature import ExactPosition + # + # + # + # # ml6 = s2.shifted(6).features[0].location # ml7 = s2.shifted(7).features[0].location @@ -1197,10 +1227,9 @@ def test_features_change_ori(): assert [str(f.extract(b).seq) for f in b.features if f.qualifiers["label"][0] == "ins"][0] == "GTACCTTTGGATC" assert [str(f.extract(b).seq) for f in b.features if f.qualifiers["label"][0] == "bb"][0] == "CGGGAAAG" - from Bio.Restriction import Acc65I, BamHI - inseq = Dseq.from_representation("GTACCTTTG\n" " GAAACCTAG") + inseq = Dseq.from_representation("GTACCTTTG\n" " GAAACCTAG") bbseq = Dseq.from_representation("GATCCGGGAAAG\n" " GCCCTTTCCATG") assert s3.seq.cut(Acc65I, BamHI) == (inseq, bbseq) @@ -1208,6 +1237,7 @@ def test_features_change_ori(): bb1, ins1 = sorted(s3.cut(Acc65I, BamHI), key=len, reverse=True) assert str(bbfeat).upper() in bbseq + assert str(insfeat).upper() in inseq for i in range(0, len(s3)): @@ -1236,12 +1266,12 @@ def test_features_change_ori(): assert eq(ins1, ins) assert bb.features[0].extract(bb).seq == bbfeat - assert str(ins.features[0].extract(ins).seq) == str(insfeat) + assert ins.features[0].extract(ins).seq.to_blunt_string() == str(insfeat) def test_amijalis(): # Thanks to https://github.com/amijalis - from pydna.dseqrecord import Dseqrecord + test_seq = "ATCGATCGATCGATCGATCGATCGATCGATCGATCG" @@ -1264,11 +1294,11 @@ def test_amijalis(): def test_figure(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from Bio.Restriction import Acc65I, KpnI, ApaI, Bsp120I - from Bio.SeqFeature import SimpleLocation - from Bio.SeqFeature import SeqFeature + + + + + # broken feature linear @@ -1596,10 +1626,10 @@ def test_figure(): # @pytest.mark.xfail(reason="issue #78") def test_jan_glx(): # Thanks to https://github.com/jan-glx - from Bio.Restriction import NdeI, BamHI - from pydna.readers import read - # from pydna.genbank import Genbank + + + # # gb = Genbank("bjornjobb@gmail.com") # puc19 = gb.nucleotide("M77789.2") # assert puc19.seguid() == "n-NZfWfjHgA7wKoEBU6zfoXib_0" @@ -1619,11 +1649,11 @@ def test_jan_glx(): def test_synced(): - from Bio.Seq import Seq - from Bio.SeqRecord import SeqRecord - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from pydna.readers import read + + + + + bb_ins = Dseqrecord("tcgcgcgtttcgATGTAgtgatgacggtgaA", circular=True) @@ -1664,7 +1694,7 @@ def test_synced(): def test_map_pCR_MCT1_HA46(): - from pydna.readers import read + pCR_MCT1_HA46 = read("pCR_MCT1_HA46.gb") @@ -1704,7 +1734,7 @@ def test_map_pCR_MCT1_HA46(): def test_map_short(): - from pydna.dseqrecord import Dseqrecord + t = Dseqrecord("AAGTTAAAATAAGGCTAGTCCGTTAT") t.map_target = slice(0, 26) @@ -1713,7 +1743,7 @@ def test_map_short(): def test_map_too_short(): - from pydna.dseqrecord import Dseqrecord + t = Dseqrecord("AAGTTAAAATAAGGCTAGTCCGTT") # t.map_target = slice(0,23) @@ -1722,7 +1752,7 @@ def test_map_too_short(): def test_map_no_match(): - from pydna.dseqrecord import Dseqrecord + t = Dseqrecord( "AGGAGGGGCCCACACCGAGGAAGTAGACTGTTATACGTCGGCGATGGTGGTAGCTAACTATGTTGCCTGCCACTACAACAGTATCTAAGCCGTGTAAAGG" @@ -1732,8 +1762,8 @@ def test_map_no_match(): def test_slicing2(): - from pydna.dseqrecord import Dseqrecord - from Bio.Restriction import BamHI, EcoRI, KpnI + + a = Dseqrecord("aaGGATCCnnnnnnnnnGAATTCccc", circular=True) assert ( @@ -1751,11 +1781,11 @@ def test_slicing2(): def test_rogerstager(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from pydna.utils import eq - from Bio.Seq import Seq - from Bio.Restriction import BsaI + + + + + answ = [] answ.append(Dseq("aaaaaaaaaaaaggtctca", "ttttttttccagagttttt"[::-1])) @@ -1774,7 +1804,7 @@ def test_rogerstager(): def test___mul__(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC", circular=False) assert s * 3 == Dseqrecord("GGATCCGGATCCGGATCC", circular=False) @@ -1787,7 +1817,7 @@ def test___mul__(): def test___repr__(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC", circular=False) assert repr(s) == "Dseqrecord(-6)" @@ -1796,8 +1826,8 @@ def test___repr__(): def test__repr_pretty_(): - from unittest.mock import MagicMock - from pydna.dseqrecord import Dseqrecord + + s = Dseqrecord("GGATCC", circular=False) pp = MagicMock() @@ -1809,8 +1839,8 @@ def test__repr_pretty_(): def test___getitem__(): - from pydna.dseqrecord import Dseqrecord - from Bio.SeqFeature import SeqFeature, SimpleLocation + + s = Dseqrecord("GGATCC", circular=False) assert s[1:-1].seq == Dseqrecord("GATC", circular=False).seq @@ -1825,7 +1855,7 @@ def test___getitem__(): assert t[1:1].seq == Dseqrecord("GATCCG").seq assert t[5:1].seq == Dseqrecord("CG", circular=False).seq assert t[9:1].seq == Dseqrecord("").seq - assert t[1:9].seq == Dseqrecord("").seq + assert t[1:9].seq == Dseqrecord("GATCC").seq # FIXME: Significant change assert t[9:10].seq == Dseqrecord("").seq assert t[10:9].seq == Dseqrecord("").seq @@ -1851,7 +1881,7 @@ def test___getitem__(): def test___eq__(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC", circular=False) t = Dseqrecord("GGATCC", circular=False) @@ -1870,7 +1900,7 @@ def test___ne__(): def test___hash__(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC", circular=False) t = Dseqrecord("GGATCC", circular=False) @@ -1883,9 +1913,9 @@ def test___hash__(): def test_linearize(): - from Bio.Restriction import BamHI, BglII - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord + + + u = Dseqrecord("GGATCC", circular=True) frag = u.linearize(BamHI) @@ -1911,10 +1941,10 @@ def test_linearize(): def test_cutters(): - from pydna.dseqrecord import Dseqrecord + obj = Dseqrecord("ggatcc") - from Bio.Restriction import BglII, BamHI + assert BglII in obj.no_cutters() assert BamHI not in obj.no_cutters() @@ -1931,14 +1961,14 @@ def test_cutters(): def test_number_of_cuts(): - from Bio.Restriction import EcoRI, BamHI, BglII - from pydna.dseqrecord import Dseqrecord + + s = Dseqrecord("GGATCCgaattc", circular=False) s.cut(BamHI) s.cut(EcoRI) s.cut(BglII) - from Bio.Restriction import RestrictionBatch + rb = RestrictionBatch((EcoRI, BamHI, BglII)) assert s.number_of_cuts(BamHI) == 1 @@ -1949,14 +1979,14 @@ def test_number_of_cuts(): def test_reverse_complement(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC", circular=False) assert s.reverse_complement() == s.rc() assert s.rc().seq == s.seq assert s.reverse_complement().seq == s.seq - from pydna.readers import read + s = read( """ @@ -1982,7 +2012,7 @@ def test_reverse_complement(): def test_shifted(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCCgaattc", circular=False) with pytest.raises(TypeError): @@ -1990,9 +2020,9 @@ def test_shifted(): def test_looped(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - import warnings + + + warnings.simplefilter("always") @@ -2070,7 +2100,7 @@ def test_looped(): def test_upper(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("Gc") s.annotations["sample"] = ["sample"] @@ -2083,7 +2113,7 @@ def test_upper(): def test_lower(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("Gc") s.annotations["sample"] = ["sample"] @@ -2095,10 +2125,73 @@ def test_lower(): assert l.__dict__ == s.__dict__ +# def test_map(): + + + + +# traces = [] + + + +# for name in glob.glob("*.ab1"): +# traces.append(abiread(name, "abi")) + +# for t in traces: +# d = Dseqrecord(t.seq) + +# if "ITVFFKEYPYDVPDYAIEGIFHAT" in d: +# tag = "tat cca tat gac gtt cca gac tat gca".replace(" ", "") +# trc = "ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + +# s = Dseqrecord(Dseq(tag, trc, 0)) +# sl = s.find_aa("YPYDVPDYA") +# assert str(s[sl].seq.translate()) == "YPYDVPDYA" +# assert "YPYDVPDYA" in s + +# tag = "AAA tat cca tat gac gtt cca gac tat gca".replace(" ", "") +# trc = " ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + +# s = Dseqrecord(Dseq(tag, trc, 0)) +# sl = s.find_aa("YPYDVPDYA") +# assert str(s[sl].seq.translate()) == "YPYDVPDYA" +# assert "YPYDVPDYA" in s + +# tag = " tat cca tat gac gtt cca gac tat gca".replace(" ", "") +# trc = "AAA ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + +# s = Dseqrecord(Dseq(tag, trc, 0)) +# sl = s.find_aa("YPYDVPDYA") +# assert str(s[sl].seq.translate()) == "YPYDVPDYA" +# assert "YPYDVPDYA" in s + +# tag = " tat cca tat gac gtt cca gac tat gca".replace(" ", "") +# trc = "AAA ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + +# s = Dseqrecord(Dseq(tag, trc, 0)) +# sl = s.find_aa("YPYDVPDYA") +# assert str(s[sl].seq.translate()) == "YPYDVPDYA" + +# tag = "tat cca tat gac gtt cca gac tat gca".replace(" ", "") +# trc = "ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + +# tag, trc = trc, tag + +# s = Dseqrecord(Dseq(tag, trc, 0)) +# sl = s.rc().find_aa("YPYDVPDYA") + +# assert str(s.rc()[sl].seq.translate()) == "YPYDVPDYA" +# assert "YPYDVPDYA" in s.rc() + +# tag = "aaa tat cca tat gac gtt cca gac tat gca".replace(" ", "") +# trc = "ttt ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + +# s = Dseqrecord(Dseq(tag, trc, 0, circular=True)) +# sl = s.find_aa("YPYDVPDYA") +# assert str(s[sl].seq.translate()) == "YPYDVPDYA" +# assert "YPYDVPDYA" in s + def test_map(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from Bio.SeqIO import read as abiread traces = [] @@ -2111,52 +2204,29 @@ def test_map(): d = Dseqrecord(t.seq) if "ITVFFKEYPYDVPDYAIEGIFHAT" in d: - tag = "tat cca tat gac gtt cca gac tat gca".replace(" ", "") - trc = "ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") - - s = Dseqrecord(Dseq(tag, trc, 0)) + # Y P Y D V P D Y A + s = Dseqrecord(Dseq.quick("tat cca tat gac gtt cca gac tat gca".replace(" ", "").encode("ascii"))) sl = s.find_aa("YPYDVPDYA") assert str(s[sl].seq.translate()) == "YPYDVPDYA" assert "YPYDVPDYA" in s - tag = "AAA tat cca tat gac gtt cca gac tat gca".replace(" ", "") - trc = " ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") - - s = Dseqrecord(Dseq(tag, trc, 0)) + s = Dseqrecord(Dseq.quick("EEE tat cca tat gac gtt cca gac tat gca".replace(" ", "").encode("ascii"))) sl = s.find_aa("YPYDVPDYA") assert str(s[sl].seq.translate()) == "YPYDVPDYA" assert "YPYDVPDYA" in s - tag = " tat cca tat gac gtt cca gac tat gca".replace(" ", "") - trc = "AAA ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") - - s = Dseqrecord(Dseq(tag, trc, 0)) + s = Dseqrecord(Dseq.quick("FFF tat cca tat gac gtt cca gac tat gca".replace(" ", "").encode("ascii"))) sl = s.find_aa("YPYDVPDYA") assert str(s[sl].seq.translate()) == "YPYDVPDYA" assert "YPYDVPDYA" in s - tag = " tat cca tat gac gtt cca gac tat gca".replace(" ", "") - trc = "AAA ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + s = Dseqrecord(Dseq.quick("FFF tat cca tat gac gtt cca gac tat gca".replace(" ", "").encode("ascii")).rc()) + assert s.find_aa("YPYDVPDYA") is None - s = Dseqrecord(Dseq(tag, trc, 0)) - sl = s.find_aa("YPYDVPDYA") - assert str(s[sl].seq.translate()) == "YPYDVPDYA" - - tag = "tat cca tat gac gtt cca gac tat gca".replace(" ", "") - trc = "ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") - - tag, trc = trc, tag - - s = Dseqrecord(Dseq(tag, trc, 0)) - sl = s.rc().find_aa("YPYDVPDYA") - - assert str(s.rc()[sl].seq.translate()) == "YPYDVPDYA" + assert str(s.rc()[sl].seq.translate(to_stop=False)) == "YPYDVPDYA" assert "YPYDVPDYA" in s.rc() - tag = "aaa tat cca tat gac gtt cca gac tat gca".replace(" ", "") - trc = "ttt ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") - - s = Dseqrecord(Dseq(tag, trc, 0, circular=True)) + s = Dseqrecord(Dseq.quick("aaa tat cca tat gac gtt cca gac tat gca".replace(" ", "").encode("ascii"), circular=True)) sl = s.find_aa("YPYDVPDYA") assert str(s[sl].seq.translate()) == "YPYDVPDYA" assert "YPYDVPDYA" in s @@ -2167,9 +2237,9 @@ def test_assemble_YEp24PGK_XK(): test YEp24PGK_XK """ - from pydna.readers import read - from pydna.utils import eq - from pydna.amplify import pcr + + + """ test YEp24PGK_XK""" p1 = read("primer1.txt", ds=False) @@ -2179,11 +2249,11 @@ def test_assemble_YEp24PGK_XK(): PCR_prod = pcr(p1, p3, XKS1) - from Bio.Restriction import BamHI + stuffer1, insert, stuffer2 = PCR_prod.cut(BamHI) - from Bio.Restriction import BglII + YEp24PGK_BglII = YEp24PGK.cut(BglII)[0] @@ -2207,9 +2277,9 @@ def test_assemble_YEp24PGK_XK(): def test_apply_cut(): - from pydna.dseqrecord import Dseqrecord - from Bio.SeqFeature import SeqFeature, SimpleLocation - from pydna.utils import location_boundaries as _location_boundaries + + + def find_feature_by_id(f: Dseqrecord, id: str) -> SeqFeature: return next(f for f in f.features if f.id == id) @@ -2230,7 +2300,7 @@ def find_feature_by_id(f: Dseqrecord, id: str) -> SeqFeature: open_seq = seq_shifted.apply_cut(dummy_cut, dummy_cut) assert len(open_seq.features) == 4 new_locs = sorted(str(f.location) for f in open_seq.features) - assert str(open_seq.seq) == "ATGaattacgtATG" + assert open_seq.seq.to_blunt_string() == "ATGaattacgtATG" if strand == 1: assert new_locs == sorted(["[0:3](+)", "[0:4](+)", "[11:14](+)", "[10:14](+)"]) elif strand == -1: @@ -2241,9 +2311,9 @@ def find_feature_by_id(f: Dseqrecord, id: str) -> SeqFeature: def test_apply_cut2(): - from pydna.dseqrecord import Dseqrecord - from Bio.SeqFeature import SeqFeature, SimpleLocation - from pydna.utils import location_boundaries as _location_boundaries + + + def find_feature_by_id(f: Dseqrecord, id: str) -> SeqFeature: return next(f for f in f.features if f.id == id) @@ -2264,28 +2334,10 @@ def find_feature_by_id(f: Dseqrecord, id: str) -> SeqFeature: open_seq = seq_shifted.apply_cut(dummy_cut, dummy_cut) assert len(open_seq.features) == 4 new_locs = sorted(str(f.location) for f in open_seq.features) - assert str(open_seq.seq) == "ATGaattacgtATG" + assert open_seq.seq.to_blunt_string() == "ATGaattacgtATG" if strand == 1: assert new_locs == sorted(["[0:3](+)", "[0:4](+)", "[11:14](+)", "[10:14](+)"]) elif strand == -1: assert new_locs == sorted(["[0:3](-)", "[0:4](-)", "[11:14](-)", "[10:14](-)"]) if strand is None: assert new_locs == sorted(["[0:3]", "[0:4]", "[11:14]", "[10:14]"]) - - -if __name__ == "__main__": - args = [ - __file__, - "--cov=pydna", - "--cov-append", - "--cov-report=html:../htmlcov", - "--cov-report=xml", - "--capture=no", - "--durations=10", - "--import-mode=importlib", - "--nbval", - "--current-env", - "--doctest-modules", - "--capture=no", - ] - pytest.main(args) diff --git a/tests/test_module_fusionpcr.py b/tests/test_module_fusionpcr.py index 08df0207..20d11450 100644 --- a/tests/test_module_fusionpcr.py +++ b/tests/test_module_fusionpcr.py @@ -7,6 +7,7 @@ from Bio.SeqFeature import SeqFeature, SimpleLocation from pydna.fusionpcr import fuse_by_pcr + x = Dseqrecord( Dseq("tctggtcaagctgaagggtattc"), features=[SeqFeature(SimpleLocation(5, 10, strand=1), type="misc_feature")] ) @@ -78,7 +79,7 @@ def test_fusionpcr3(): assert eq(result, r) -from Bio.SeqFeature import SeqFeature, SimpleLocation + x = Dseqrecord( Dseq("tctggtcaagctgaagggtattc"), features=[SeqFeature(SimpleLocation(5, 10, strand=1), type="misc_feature")] @@ -90,7 +91,3 @@ def test_fusionpcr3(): seqtuples = [(x, y, z)] for arg in seqtuples: result = fuse_by_pcr(arg, limit=5).pop() - - -if __name__ == "__main__": - pytest.main([__file__, "-vv", "-s"]) diff --git a/tests/test_module_ligate.py b/tests/test_module_ligate.py index 14cde155..bc037be3 100755 --- a/tests/test_module_ligate.py +++ b/tests/test_module_ligate.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- import pytest - +from pydna.ligate import ligate +from pydna.dseq import Dseq +from pydna.dseqrecord import Dseqrecord def test_ligate(): - from pydna.ligate import ligate - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord + a = Dseqrecord( Dseq.from_representation( @@ -73,7 +73,3 @@ def list_combinations(a, b): for ss in lsequences: assert ss.seguid() == "ldseguid=vBQxmszgfR4b84O0wI7d_ya9uDA" assert len(ss) == 12 - - -if __name__ == "__main__": - pytest.main([__file__, "-vv", "-s"])