-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchange.py
More file actions
122 lines (102 loc) · 3.84 KB
/
change.py
File metadata and controls
122 lines (102 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#-*- coding:unicode_escape -*-
import sys
import os
import re
import jieba
import codecs
import json
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle as pickle
#reload(sys)
#sys.setdefaultencoding('utf-8')
# Enable jieba's parallel segmentation with 32 worker processes.
jieba.enable_parallel(32)
# Dead code preserved as a module-level string literal: a one-off pass
# that re-decoded test.json from \uXXXX escape sequences into a UTF-8
# file.  NOTE(review): kept byte-identical; consider deleting outright.
'''ifstream = open("test.json", "r")
ofstream = open("test-utf-8", "w")
#while 1:
for i in range(0, 10):
tmp = ifstream.readlines()
# if tmp == "":
# break
n = len(tmp)
for j in range(n):
tmp[j] = tmp[j][:-1]
tmp[j] = tmp[j].encode('latin-1').decode('unicode_escape')
ofstream.write(tmp[j] + '\n')
ofstream.close()'''
# Input corpus (one JSON document per line) and the segmented output file.
input_file = "test.json"
output_file = "test-jieba"
# Stop-word file read by vector_space(); change() reads "stopwords.txt"
# instead — NOTE(review): two different stop-word files, confirm intent.
stopword_path = "stopwords.dat"
# Destination pickle for the Bunch produced by change().
wordbag_path = "test_word.dat"
# Module-level stop-word list; filled by change() and later passed to
# vector_space() by main().
stopword=[]
def change(input_file, output_file):
    """Segment the first JSON document of *input_file* with jieba.

    Reads one JSON line (expects "id" and "content" keys), strips every
    non-CJK character, cuts the remaining text with jieba, drops stop
    words (loaded from "stopwords.txt" into the module-level `stopword`
    list), writes the tokens to *output_file* (one per line), and
    pickles the result as a Bunch to the module-level `wordbag_path`.
    """
    # Load stop words once.  We still append to the module-level list
    # because main() later hands that same list to vector_space().
    with open("stopwords.txt", "r") as stop_fin:
        for line in stop_fin:
            stopword.append(line[:-1])
    # Set lookup is O(1); the original scanned the list per token.
    stop_set = set(stopword)

    bunch_obj = Bunch(id=[], content=[], label=[])
    # Keep only CJK ideographs (U+4E00..U+9FA5); delete everything else.
    ce = re.compile(u'[^\u4E00-\u9FA5]+')
    # `with` closes both handles; the original leaked fin and stop_fin.
    with open(input_file, "r") as fin, open(output_file, "w") as fout:
        # The original processed exactly one document — preserved here.
        for _ in range(1):
            tem = json.loads(fin.readline())
            text_content = re.sub(ce, "", tem["content"])
            # BUG FIX: the original wrote the two-character text "\\n"
            # (backslash + n) instead of a real newline.
            fout.write("id: " + tem["id"] + "\n")
            bunch_obj.id.append(tem["id"])
            text_prepared = [tok for tok in jieba.cut(text_content)
                             if tok not in stop_set]
            fout.writelines(tok + "\n" for tok in text_prepared)
            bunch_obj.content = text_prepared.copy()
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch_obj, file_obj)
def _writebunchobj(path, bunchobj):
with open(path, "wb") as file_obj:
pickle.dump(bunchobj, file_obj)
def _readfile(path):
with open(path, "rb") as fp:
content = fp.read()
return content
def _readbunchobj(path):
with open(path, "rb") as file_obj:
bunch = Bunch(id = [])#pickle.load(file_obj)
return bunch
def vector_space(stopword, bunch_path, space_path, train_tfidf_path=None):
    """Build a TF-IDF space from a pickled Bunch and pickle it to *space_path*.

    stopword: either an in-memory list/tuple/set of stop words, or a
        path to a stop-word file (one word per line).
        BUG FIX: the original ignored this parameter entirely and
        always read the module-level `stopword_path` file, even though
        main() passes the in-memory stop-word list.
    bunch_path: pickle produced by change() (a Bunch with id/content).
    space_path: destination pickle for the resulting TF-IDF Bunch.
    train_tfidf_path: optional pickled *training* space whose
        vocabulary is reused so test vectors align column-for-column
        with the training matrix.
    """
    if isinstance(stopword, (list, tuple, set)):
        stpwrdlst = list(stopword)
    else:
        stpwrdlst = _readfile(stopword).splitlines()
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(id=bunch.id.copy(), content=bunch.content.copy(),
                       tdm=[], vocabulary={})
    if train_tfidf_path is not None:
        # Reuse the training vocabulary so feature columns line up.
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
                                     sublinear_tf=True, max_df=0.5,
                                     vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.content)
    else:
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
                                     sublinear_tf=True, max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.content)
        tfidfspace.vocabulary = vectorizer.vocabulary_
    _writebunchobj(space_path, tfidfspace)
def main():
    # Segment test.json into output_file and pickle the word bag.
    # Side effect: fills the module-level `stopword` list, which is
    # passed to vector_space below.
    change(input_file, output_file)
    # NOTE(review): vector_space loads bunch_path via _readbunchobj
    # (pickle), but "train.json" is a JSON file — looks wrong; confirm
    # the intended bunch path (change() writes wordbag_path).
    vector_space(stopword, "train.json", "tfidf")
    # NOTE(review): "tfdifspace.dat" (sic — likely "tfidfspace") is
    # never written anywhere in this file, and "test.json" is JSON,
    # not a pickled TF-IDF space; both loads look inconsistent.
    trainpath = "tfdifspace.dat"
    train_set = _readbunchobj(trainpath)
    testpath = "test.json"
    test_set = _readbunchobj(testpath)
    # Multinomial Naive Bayes over the TF-IDF matrices.
    clf = MultinomialNB(alpha=0.001).fit(train_set.tdm, train_set.label)
    predicted = clf.predict(test_set.tdm)
    #for flabel,file_name,expct_cate in zip(test_set.label,test_set.filenames,predicted):
    #    if flabel != expct_cate:
    #        print file_name, ": actual class:", flabel, " --> predicted class:", expct_cate
main()