Skip to content

Commit 207ca8d

Browse files
Added steps to create CMU movies corpus.
1 parent cb0c7dc commit 207ca8d

File tree

1 file changed

+79
-1
lines changed

1 file changed

+79
-1
lines changed

src/synthesizrr/expt/corpus.py

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,8 @@ def amazon_products_count_num_tokens(df_path):
168168
from transformers import AutoTokenizer
169169
tokenizer = AutoTokenizer.from_pretrained('TheBloke/Llama-2-13B-fp16')
170170
sum_is: int = sum(
171-
[len(input_ids) for input_ids in tokenizer(ser_part.tolist(), add_special_tokens=False)['input_ids']])
171+
[len(input_ids) for input_ids in tokenizer(ser_part.tolist(), add_special_tokens=False)['input_ids']]
172+
)
172173
return sum_is
173174

174175
counts: List[int] = accumulate([
@@ -333,8 +334,85 @@ def create_realnews():
333334
write_realnews_partition(realnews_dominant, corpus_dir=corpus_dir, rn_name='realnews-dominant')
334335

335336

337+
def create_cmu_movies():
    """Build the CMU Movie Summaries corpus and report its token count.

    Steps:
      1. Verify the extracted ``MovieSummaries`` folder (downloaded from
         https://www.cs.cmu.edu/~ark/personas/) exists under
         ``{CORPUS_DIR}/data/cmu_movies/``.
      2. Merge ``plot_summaries.txt`` with ``movie.metadata.tsv`` on the
         Wikipedia movie id and write the result as a single Parquet file
         under ``raw-text/`` with the pipeline's generic schema
         (``idx``, ``text``).
      3. Count Llama-2 tokens over the written Parquet partitions in
         parallel via Ray and print the total in millions.

    Raises:
        SystemError: if the corpus directory is empty, i.e. the data has
            not been downloaded and extracted yet.
    """
    corpus_dir: FileMetadata = FileMetadata.of(
        f'{CORPUS_DIR}/data/cmu_movies/'
    )
    corpus_dir.mkdir()
    if len(corpus_dir.list()) == 0:
        # The raw data must be downloaded manually; fail with instructions.
        raise SystemError(
            f'Expected CMU Movies to be in folder "{corpus_dir.path}". '
            f'Please download the data from https://www.cs.cmu.edu/~ark/personas/ and extract it. '
            f'You should get the folder "MovieSummaries".'
        )
    with Timer('Reading and merging plot_summaries.txt and movie.metadata.tsv'):
        # Both source files live inside the extracted MovieSummaries folder.
        movie_summaries_dir: FileMetadata = corpus_dir.subdir_in_dir('MovieSummaries', return_metadata=True)
        # plot_summaries.txt: headerless TSV of (wiki_movie_id, plot text).
        movie_plots: pd.DataFrame = pd.read_csv(
            movie_summaries_dir.file_in_dir('plot_summaries.txt'),
            sep='\t',
            header=None,
            names=[
                'wiki_movie_id',
                'plot_summary',
            ],
        )
        # movie.metadata.tsv: headerless TSV of per-movie metadata.
        movie_meta: pd.DataFrame = pd.read_csv(
            movie_summaries_dir.file_in_dir('movie.metadata.tsv'),
            sep='\t',
            header=None,
            names=[
                'wiki_movie_id',
                'freebase_movie_id',
                'title',
                'release_date',
                'box_office_revenue',
                'runtime',
                'languages',
                'countries',
                'genres',
            ],
        )
        # Inner-join metadata with plots, then rename to the generic corpus
        # schema used by the rest of the pipeline: idx (id) and text (body).
        movies: pd.DataFrame = movie_meta.merge(
            movie_plots, on='wiki_movie_id'
        ).reset_index(drop=True).rename(
            columns=dict(plot_summary='text', wiki_movie_id='idx')
        )
    corpus_raw_text_dir: FileMetadata = corpus_dir.subdir_in_dir('raw-text', return_metadata=True)
    movies.to_parquet(corpus_raw_text_dir.file_in_dir('cmu-movie-summary.parquet'))
    print(f'Done creating CMU Movies corpus, final data is at: "{corpus_raw_text_dir.path}"')

    def cmu_movies_count_num_tokens(df_path):
        # Count Llama-2 tokens in one Parquet partition (runs remotely on Ray).
        # NOTE(review): the Parquet written above has its columns already
        # renamed to 'idx'/'text' (not 'wiki_movie_id'/'plot_summary'), so
        # the schema and column access must use the renamed names — the
        # original code read 'plot_summary', which no longer exists in the
        # saved file. Confirm against Reader's raw-read semantics.
        df_part = Reader.of(
            'parquet',
            data_schema={
                'idx': 'index',
                'text': 'text',
            }
        ).read(df_path, raw=True)
        ser_part = df_part['text']
        # Imported lazily so the (heavy) transformers import happens on the
        # Ray worker, not in the driver process.
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained('TheBloke/Llama-2-13B-fp16')
        # No special tokens: we want raw content-token counts only.
        return sum(
            len(input_ids)
            for input_ids in tokenizer(ser_part.tolist(), add_special_tokens=False)['input_ids']
        )

    # Fan out one Ray task per Parquet partition and gather the counts.
    counts: List[int] = accumulate([
        run_parallel_ray(
            cmu_movies_count_num_tokens,
            df_path=df_path,
        )
        for df_path in FileMetadata.of(
            corpus_raw_text_dir.path,
            file_glob='*.parquet',
        ).list()
    ], progress=True)
    print(f'CMU Movies corpus has {round(sum(counts) / 1e6, 2)} million tokens')
410+
411+
336412
if __name__ == '__main__':
    # Build each corpus in turn; force a GC between the builds since each
    # one materializes large DataFrames.
    for _build_corpus in (create_amazon_products, create_realnews, create_cmu_movies):
        _build_corpus()
        gc.collect()

0 commit comments

Comments
 (0)