forked from sohaib-ahsan/forward-reverse-indexing
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMain.py
More file actions
64 lines (53 loc) · 1.64 KB
/
Main.py
File metadata and controls
64 lines (53 loc) · 1.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# IMPORTING THE REQUIRED LIBRARIES
import nltk
import json
from os import listdir
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from os.path import isfile, join
# DEFINING THE FUNCTION TO GET THE KEY(S) OF THE DICTIONARY FOR A VALUE
def get_key(val, mapping=None, out=None):
    """Collect every dictionary key whose value list contains ``val``.

    Backward compatible with the original: called as ``get_key(val)`` it
    searches the module-level ``mydict`` and appends matches to the
    module-level ``keys`` list, exactly as before.  A key is appended once
    per occurrence of ``val`` in its value list (original semantics kept).

    Parameters
    ----------
    val : object
        Value to search for inside each key's value list.
    mapping : dict | None
        Mapping of key -> iterable of values.  Defaults to the global
        ``mydict``.
    out : list | None
        List the matching keys are appended to.  Defaults to the global
        ``keys``.

    Returns
    -------
    list
        The list the matching keys were appended to (also mutated in
        place, so existing callers that ignore the return still work).
    """
    if mapping is None:
        mapping = mydict  # original behavior: search the global index
    if out is None:
        out = keys        # original behavior: accumulate into the global list
    for key, values in mapping.items():
        # NOTE: one append per occurrence, not per key — preserved on purpose
        for item in values:
            if item == val:
                out.append(key)
    return out
# LOCATION OF THE NELA-GT-2021 JSON DATASET ON DISK
path = r"E:\Elementry Search Engine\Backend\Dataset\nela-gt-2021\newsdata"
# keep regular files only — subdirectories in the dataset folder are skipped
onlyfiles = [entry for entry in listdir(path) if isfile(join(path, entry))]
# FETCH THE NLTK STOPWORD CORPUS (no-op if it is already cached locally)
nltk.download('stopwords')
stemmer = PorterStemmer()
# SHARED COUNTERS AND LOOKUP TABLES FOR THE INDEXER
articleCounter = 0   # articles processed so far (hard limit applied later)
filecounter = 0      # JSON files opened so far
mydict = {}          # forward/inverted index under construction
mydict2 = {}         # secondary index
docID = 0            # running numeric document identifier
keys = []            # accumulator filled by get_key()
documentID = []      # "documentID<n>" labels, one per parsed file
# CHARACTERS TO STRIP FROM ARTICLE TEXT: a few typographic marks plus the
# ASCII punctuation ranges 33-64 ('!'..'@'), 123-127 ('{'..DEL) and
# 91-96 ('['..'`'), appended in that order.
# NOTE(review): range 33-64 also covers the digits '0'-'9', so numbers are
# stripped as well — confirm that dropping digits is intended.
unwords = [',', ';', '“', '”']
for lo_code, hi_code in ((33, 65), (123, 128), (91, 97)):
    unwords.extend(chr(code) for code in range(lo_code, hi_code))
# PARSING THE JSON FILES
for file in onlyfiles:
with open(r"E:\Elementry Search Engine\Backend\Dataset\nela-gt-2021\newsdata\\" + file, "r") as file:
data = json.load(file)
print(file)
filecounter += 1
stems = []
# DEFINING THE LIMIT OF THE NUMBER OF ARTICLES TO BE PARSED
if articleCounter >= 100000:
break
else:
docID += 1
documentID.append("documentID" + str(docID))
for dict in data:
articleCounter += 1
tokenized_text = word_tokenize(dict['content'])
stop_words = set(stopwords.words("english"))