diff --git a/CHANGELOG.md b/CHANGELOG.md
index 303530a57..0713c5e75 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,11 @@
-# v0.1.0
+# Change Log
+
+All notable changes to this project will be documented in this file.
+
+## v0.1.1
+
+Added more informative error messages for cases where an FDR distribution cannot be made or there is not enough coverage.
+
+## v0.1.0
 
 First major release of the FIRE pipeline. This release includes a refactor to reduce the computation by increased use of ft, changes to the output file names to include the fire version among other things, and finally a new launching method for the pipeline that uses pixi. Results are very similar to v0.0.7 of the pipeline; however, there are minor differences in the peak calls and the output names.
diff --git a/pixi.toml b/pixi.toml
index 8b017bf87..83111f3d2 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -4,7 +4,7 @@ channels = ["conda-forge", "bioconda"]
 description = "Add a short description here"
 name = "FIRE"
 platforms = ["osx-64", "linux-64"]
-version = "0.1.0"
+version = "0.1.1"
 
 [tasks]
 fmt = "ruff format . && taplo format pixi.toml && snakefmt workflow/"
diff --git a/workflow/scripts/cov.py b/workflow/scripts/cov.py
index 4c69c48c4..0754ef914 100644
--- a/workflow/scripts/cov.py
+++ b/workflow/scripts/cov.py
@@ -76,10 +76,12 @@ def polars_read():
 print(f"\nmean coverage: {mean}", file=sys.stderr)
 print(f"median coverage: {coverage}\n", file=sys.stderr)
 
-if coverage <= 1:
+if coverage < 5:
     raise ValueError(
-        f"Median coverage is {coverage}! Did you use the correct reference, or is data missing from most of your genome. If so consider the keep_chromosomes parameter in config.yaml"
+        f"Median coverage is {coverage}! Did you use the correct reference, or is data missing from most of your genome? We recommend at least 10x coverage to use FIRE and require at least 5x. "
+        "If you are only examining data from a subset of chromosomes, consider using the keep_chromosomes parameter in config.yaml."
     )
+
 open(snakemake.output.cov, "w").write(str(round(coverage)) + "\n")
 open(snakemake.output.minimum, "w").write(str(round(min_coverage)) + "\n")
 open(snakemake.output.maximum, "w").write(str(round(max_coverage)) + "\n")
diff --git a/workflow/scripts/fdr-table.py b/workflow/scripts/fdr-table.py
index 8ec3f023a..c78d9fb53 100644
--- a/workflow/scripts/fdr-table.py
+++ b/workflow/scripts/fdr-table.py
@@ -8,6 +8,7 @@
 import numpy as np
 import polars.selectors as cs
 import gzip
+import sys
 
 # from numba import njit
 ROLLING_FIRE_SCORE_WINDOW_SIZE = 200
@@ -25,12 +26,26 @@ def find_nearest(array, value):
     return idx
 
 
+def my_read_csv(*args, **kwargs):
+    try:
+        result = pl.read_csv(*args, **kwargs)
+    except pl.exceptions.NoDataError as e:
+        print(
+            "No data was found in the input file. Check the input file and make sure it is not empty. "
+            "It is likely that the input data was not generated correctly or that it was impossible to find peaks at the specified FDR value.",
+            file=sys.stderr,
+        )
+        print(e, file=sys.stderr)
+        sys.exit(1)
+    return result
+
+
 # ['#chrom', 'start', 'end', 'coverage', 'fire_coverage', 'score', 'nuc_coverage', 'msp_coverage',
 # 'coverage_H1', 'fire_coverage_H1', 'score_H1', 'nuc_coverage_H1', 'msp_coverage_H1',
 # 'coverage_H2', 'fire_coverage_H2', 'score_H2', 'nuc_coverage_H2', 'msp_coverage_H2']
 def read_pileup_file(infile, nrows):
     # get the header from the first line of the file
-    header = pl.read_csv(infile, separator="\t", n_rows=1).columns
+    header = my_read_csv(infile, separator="\t", n_rows=1).columns
 
     # check that there is at least two lines
     open_infile = gzip.open if is_gzipped(infile) else open
@@ -51,7 +66,7 @@ def read_pileup_file(infile, nrows):
     logging.info(f"Schema overrides for the pileup file:\n{schema_overrides}")
 
     # read the file
-    pileup = pl.read_csv(
+    pileup = my_read_csv(
         infile,
         separator="\t",
         has_header=False,
@@ -132,7 +147,7 @@ def fdr_table_from_scores(fire_scores):
     return results
 
 
-def make_fdr_table(infile, outfile, nrows, max_cov=None, min_cov=None):
+def make_fdr_table(infile, outfile, nrows, max_cov=None, min_cov=None, max_fdr=0.05):
     # read the pileup file
     pileup = read_pileup_file(infile, nrows)
     # filter on coverages if needed
@@ -172,11 +187,16 @@ def make_fdr_table(infile, outfile, nrows, max_cov=None, min_cov=None):
     logging.info(f"Done aggregating pileup file:\n{fire_scores}")
     fdr_table = fdr_table_from_scores(fire_scores)
     fdr_table.to_csv(outfile, sep="\t", index=False)
+    # raise an error if no threshold with an FDR below max_fdr is found
+    if fdr_table["FDR"].min() > max_fdr:
+        raise ValueError(
+            f"No FIRE score threshold has an FDR < {max_fdr}. Check the input Fiber-seq data with the QC pipeline and make sure you are using WGS Fiber-seq data."
+        )
     return fdr_table
 
 
 def read_fdr_table(infile):
-    fdr_table = pl.read_csv(infile, separator="\t").to_pandas()
+    fdr_table = my_read_csv(infile, separator="\t").to_pandas()
     logging.info(f"Read FDR table:\n{fdr_table}")
     return fdr_table
 
@@ -283,6 +303,7 @@ def main(
     nrows: Optional[int] = None,
     max_cov: Optional[int] = None,
     min_cov: Optional[int] = None,
+    max_fdr: float = 0.05,
     verbose: int = 0,
 ):
     """
@@ -303,7 +324,7 @@ def main(
         apply_fdr_table(infile, outfile, fdr_table, nrows)
     else:
         fdr_table = make_fdr_table(
-            infile, outfile, nrows, min_cov=min_cov, max_cov=max_cov
+            infile, outfile, nrows, min_cov=min_cov, max_cov=max_cov, max_fdr=max_fdr
        )
     return 0
 
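
For reference, a minimal sketch of the new `max_fdr` guard introduced in `make_fdr_table` above: the pipeline now stops with an actionable error instead of emitting an FDR table that contains no usable score cutoff. The toy `fdr_table` values below are invented for illustration; only the `FDR`-column check and the error message come from the diff.

```python
import pandas as pd

# Toy FDR table with invented values; the real table is produced by
# fdr_table_from_scores() in workflow/scripts/fdr-table.py.
fdr_table = pd.DataFrame(
    {"threshold": [250, 150, 50], "FDR": [0.08, 0.19, 0.42]}
)

max_fdr = 0.05  # default used by make_fdr_table in this release
if fdr_table["FDR"].min() > max_fdr:
    # Mirrors the new guard: no FIRE score threshold reaches the requested
    # FDR, so fail loudly rather than continue with an unusable table.
    raise ValueError(
        f"No FIRE score threshold has an FDR < {max_fdr}. Check the input "
        "Fiber-seq data with the QC pipeline and make sure you are using "
        "WGS Fiber-seq data."
    )
```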