# nltk_tests.py
# Download Project Gutenberg e-texts and save each one under books/ in a
# file named after its author.
import io
import os
import sys

import nltk
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata
from nltk import word_tokenize
def author_slug(author):
    """Collapse an author name into a compact token for use in a filename.

    The name is split on commas, the first two pieces are stripped and
    concatenated, and any remaining spaces are removed, so
    ``"Melville, Herman"`` becomes ``"MelvilleHerman"``.  A name without a
    comma (``"Anonymous"``) is returned with its spaces removed instead of
    being rejected.

    Args:
        author: Author name as a text string.

    Returns:
        The compacted name; an empty string if nothing usable remains.
    """
    parts = [part.strip() for part in author.split(",")]
    # Only the first two comma-separated pieces are used; anything after
    # them is ignored.
    return "".join(parts[:2]).replace(" ", "")


def write_book(out_dir, slug, text):
    """Write *text* to ``<out_dir>/<slug>.txt`` as UTF-8 and return the path.

    The file is opened in "w" mode, so an existing file with the same name
    is replaced.

    Args:
        out_dir: Existing directory to write into.
        slug: File name without the ``.txt`` extension.
        text: Book contents; a byte string is decoded as UTF-8 first.

    Returns:
        The path of the file that was written.
    """
    if isinstance(text, bytes):
        # io.open in text mode only accepts text, never raw bytes.
        text = text.decode("utf-8")
    path = os.path.join(out_dir, slug + ".txt")
    # The context manager closes the handle even if the write fails, and the
    # explicit encoding keeps non-ASCII books from raising on write.
    with io.open(path, "w", encoding="utf-8") as handle:
        handle.write(text)
    return path


def main(first_id=1000, last_id=20000, out_dir="books"):
    """Download a range of e-texts and save each one under *out_dir*.

    For every id in ``range(first_id, last_id)`` the author metadata is
    looked up, the text is loaded with its headers stripped, and the result
    is written to a file named after the author.  A failure on one book is
    reported and the loop moves on to the next id.

    NOTE(review): the file name is derived from the author alone, so a later
    book by the same author overwrites an earlier one.  This matches the
    original behaviour; confirm whether one file per book was intended.

    Args:
        first_id: First e-text id to fetch (inclusive).
        last_id: E-text id to stop at (exclusive).
        out_dir: Directory for the output files; created if missing.
    """
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for etext_id in range(first_id, last_id):
        try:
            # Sorting makes the chosen author deterministic when the
            # metadata holds more than one name.
            authors = sorted(get_metadata('author', etext_id))
            slug = author_slug(authors[0]) if authors else ""
            if not slug:
                print("Skipping etext %d: no usable author name" % etext_id)
                continue
            # Load the text only once a usable file name is known.
            text = strip_headers(load_etext(etext_id)).strip()
            path = write_book(out_dir, slug, text)
            print("Saved etext %d to %r" % (etext_id, path))
        except Exception as err:
            # Per-book boundary: report and continue.  Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate
            # so the run can still be stopped.
            print("Unexpected error for etext %d: %r" % (etext_id, err))


if __name__ == "__main__":
    main()