-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtmp.py
More file actions
171 lines (130 loc) · 5.63 KB
/
tmp.py
File metadata and controls
171 lines (130 loc) · 5.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import pandas as pd
from genome_wide_screen import MpraScreen
from datasets import load_full_project_data
from constants import PROCESSED_DIR
import os
import pandas as pd
import numpy as np
import io, subprocess
from tqdm import tqdm
# from sklearn.utils import resample
from sklearn.preprocessing import OneHotEncoder
from utils.bigwig_utils import pull_roadmap_features, compile_roadmap_features
from constants import *
from pathlib import Path
def make_output_csv(project, split, raw_score, calib_score):
df = pd.read_csv(PROCESSED_DIR / project / f'matrix_{split}.csv',
usecols=[0, 1, 2])
ref = df[['chr', 'pos', 'Label']].copy()
ref['raw_score'] = raw_score
ref['calib_score'] = calib_score
ref.to_csv(
PROCESSED_DIR / 'analysis' / f'{project}_{split}_scores.csv',
index=False
)
def get_random_bed(n_samples=100000):
""" Generate random bedfile corresponding to n_samples random locations
in the genome. Used for picking locations to extract unlabeled
data for semi-supervised learning.
Offset reduces the range of indices per chromosome to sample, for example
when edges are not available.
"""
total_range = sum([x for x in GenRange.values()])
chrs = []
samples = []
for chrom in range(1, 23):
select = int(n_samples * GenRange[chrom] / total_range)
bps = np.random.randint(low=0, high=GenRange[chrom], size=select)
samples.append(bps)
chrs.extend([chrom] * select)
samples = np.concatenate(samples)
chrs = np.array(chrs)
bed = pd.DataFrame({'chr': chrs,
'pos': samples,
'pos_end': samples+1,
'rs': ['ul{0}'.format(x+1) for x in range(len(samples))]})
return bed
def extract_mpra_screen(bedfile, outpath):
""" Extract MPRA Screen predictions using an input bed table.
The bed file should be a pandas DataFrame with the following format:
chr (int): chromosome
pos (int): position of interest
variant (str): (optional) name or rsid assigned to variant for naming
The extracted data is saved as a tsv to the specified outpath.
Note: currently only supports hg19 scores and extracts one position per row.
"""
if os.path.exists(outpath):
os.remove(outpath)
SCORE_DIR = Path('/oak/stanford/groups/zihuai/fredlu/processed/genomeScreen/scores')
SCORE_BASE = 'mpra_screen_chr_XX.tsv.gz'
# load column header names
head_file = SCORE_DIR / 'header.txt'
with open(head_file, 'r') as f:
header = f.read()
header = header.strip('\n').split('\t')
with open(outpath, 'w') as fp:
fp.write('\t'.join(header) + '\n')
for row in tqdm(bedfile.itertuples()):
# make system command
command = f'tabix {SCORE_DIR}/{SCORE_BASE} '
args = f'{row.chr}:{row.pos}-{row.pos}'
full_cmd = command.replace('XX', str(row.chr)) + args
p = subprocess.Popen(full_cmd, shell=True, stdout=subprocess.PIPE)
stdout, _ = p.communicate()
if len(stdout) > 0:
# read query string
query = io.StringIO(stdout.decode('utf-8'))
features = pd.read_csv(query, sep=r'\s+', header=None,
index_col=False, na_values='.')
# remove tabix end-position overlapping with query position
features = features[features[1] == row.pos]
features = features.astype(str).values.tolist()
for fr in features:
fp.write('\t'.join(fr) + '\n')
if __name__ == '__main__':
# screen = MpraScreen()
# # mpra_nova + 1kg_background train set
# print('nova train')
# X, X_neighbor = load_full_project_data('final_1kg', split='train')
# X = screen.preprocess(X, X_neighbor)
# raw_score = screen.predict(X)
# calib_score = screen.predict_calibrate(X)
# make_output_csv('final_1kg', 'train', raw_score, calib_score)
# # mpra_nova + 1kg_background test set
# print('nova test')
# X, X_neighbor = load_full_project_data('final_1kg', split='test')
# X = screen.preprocess(X, X_neighbor)
# raw_score = screen.predict(X)
# calib_score = screen.predict_calibrate(X)
# make_output_csv('final_1kg', 'test', raw_score, calib_score)
# # random 1kg_background set
# print('background all')
# X, X_neighbor = load_full_project_data('1kg_background4', split='all')
# X = screen.preprocess(X, X_neighbor)
# raw_score = screen.predict(X)
# calib_score = screen.predict_calibrate(X)
# make_output_csv('1kg_background4', 'all', raw_score, calib_score)
# from genome_wide_screen import sample_for_calibrator
# sample_for_calibrator(100000)
import numpy as np
import matplotlib.pyplot as plt
from utils.bed_utils import get_random_bed
bed = get_random_bed(n_samples=20000)
print(bed.head())
extract_mpra_screen(bed, './tmp.tsv')
# sample_vals = []
# for chrom in range(1, 23):
# df = pd.read_csv(PROCESSED_DIR / 'genomeScreen' / f'tmp_{chrom}.csv', sep='\t')
# sample_vals.append(df.dropna().iloc[:, 4].values)
# df = pd.read_csv('./tmp.tsv', sep='\t')
# all_sample_vals = np.concatenate(sample_vals)
# print(all_sample_vals.shape)
# plt.hist(df.MPRA_raw, bins=20)
# plt.savefig('./raw_score_hist.png')
# plt.close()
# plt.hist(df.MPRA_score, bins=20)
# plt.savefig('./calibrated_score_hist.png')
# plt.close()
# plt.hist(df.MPRA_PHRED, bins=20)
# plt.savefig('./phred_score_hist.png')
# plt.close()