@@ -168,7 +168,8 @@ def amazon_products_count_num_tokens(df_path):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained('TheBloke/Llama-2-13B-fp16')
         sum_is: int = sum(
-            [len(input_ids) for input_ids in tokenizer(ser_part.tolist(), add_special_tokens=False)['input_ids']])
+            [len(input_ids) for input_ids in tokenizer(ser_part.tolist(), add_special_tokens=False)['input_ids']]
+        )
         return sum_is
 
     counts: List[int] = accumulate([
@@ -333,8 +334,93 @@ def create_realnews():
     write_realnews_partition(realnews_dominant, corpus_dir=corpus_dir, rn_name='realnews-dominant')
 
 
+def create_cmu_movies():
+    corpus_dir: FileMetadata = FileMetadata.of(
+        f'{CORPUS_DIR}/data/cmu_movies/'
+    )
+    corpus_dir.mkdir()
+    if len(corpus_dir.list()) == 0:
+        raise SystemError(
+            f'Expected CMU Movies to be in folder "{corpus_dir.path}". '
+            f'Please download the data from https://www.cs.cmu.edu/~ark/personas/ and extract it. '
+            f'You should get the folder "MovieSummaries".'
+        )
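+    # The MovieSummaries archive provides plot_summaries.txt and movie.metadata.tsv,
+    # both tab-separated with no header row, keyed by Wikipedia movie id.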
+    with Timer('Reading and merging plot_summaries.txt and movie.metadata.tsv'):
+        movie_plots: pd.DataFrame = pd.read_csv(
+            corpus_dir.subdir_in_dir('MovieSummaries', return_metadata=True).file_in_dir('plot_summaries.txt'),
+            sep='\t',
+            header=None,
+            names=[
+                'wiki_movie_id',
+                'plot_summary',
+            ]
+        )
+        movie_meta: pd.DataFrame = pd.read_csv(
+            corpus_dir.subdir_in_dir('MovieSummaries', return_metadata=True).file_in_dir('movie.metadata.tsv'),
+            sep='\t',
+            header=None,
+            names=[
+                'wiki_movie_id',
+                'freebase_movie_id',
+                'title',
+                'release_date',
+                'box_office_revenue',
+                'runtime',
+                'languages',
+                'countries',
+                'genres'
+            ]
+        )
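+        # Join metadata onto plots via the shared wiki_movie_id key, then rename
+        # the id and plot columns to 'idx' and 'text'.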
+        movies: pd.DataFrame = movie_meta.merge(
+            movie_plots, on='wiki_movie_id'
+        ).reset_index(drop=True).rename(
+            columns=dict(plot_summary='text', wiki_movie_id='idx')
+        )
+    corpus_raw_text_dir: FileMetadata = corpus_dir.subdir_in_dir('raw-text', return_metadata=True)
+    movies.to_parquet(corpus_raw_text_dir.file_in_dir('cmu-movie-summary.parquet'))
+    print(f'Done creating CMU Movies corpus, final data is at: "{corpus_raw_text_dir.path}"')
+
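+    # Per-shard worker: tokenizes the 'text' column of one Parquet shard
+    # (columns were renamed to 'idx'/'text' above) and returns its token count.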
+    def cmu_movies_count_num_tokens(df_path):
+        df_part = Reader.of(
+            'parquet',
+            data_schema={
+                'idx': 'index',
+                'text': 'text',
+            }
+        ).read(df_path, raw=True)
+        ser_part = df_part['text']
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained('TheBloke/Llama-2-13B-fp16')
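+        # add_special_tokens=False: count content tokens only, excluding BOS/EOS.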
+        sum_is: int = sum(
+            [len(input_ids) for input_ids in tokenizer(ser_part.tolist(), add_special_tokens=False)['input_ids']]
+        )
+        return sum_is
+
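+    # Count tokens of each Parquet shard in parallel (one Ray task per file).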
+    counts: List[int] = accumulate([
+        run_parallel_ray(
+            cmu_movies_count_num_tokens,
+            df_path=df_path,
+        )
+        for df_path in FileMetadata.of(
+            corpus_raw_text_dir.path,
+            file_glob='*.parquet',
+        ).list()
+    ], progress=True)
+    print(f'CMU Movies corpus has {round(sum(counts) / 1e6, 2)} million tokens')
+
+
 if __name__ == '__main__':
     create_amazon_products()
     gc.collect()
     create_realnews()
     gc.collect()
+    create_cmu_movies()
+    gc.collect()