-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTweetParser.py
More file actions
119 lines (77 loc) · 2.44 KB
/
TweetParser.py
File metadata and controls
119 lines (77 loc) · 2.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/python
import simplejson as json
import codecs
import os
import bz2
from optparse import OptionParser
def parseTweet(jsondata):
    """Print one tweet as a tab-separated line.

    ``jsondata`` is a single JSON document. When it decodes to an object
    that has a ``text`` key, the function prints
    ``created_at<TAB><TAB>user.screen_name<TAB><TAB>text``. Objects without
    a ``text`` key are skipped without any output.

    Input that is not valid JSON, is not a JSON object, or lacks the
    expected fields makes the function print ``fail``; such errors are
    never propagated to the caller. Returns None in every case.

    Only three attributes are extracted; the record may carry others
    (id, timestamp, user name, ...) that are ignored here.
    """
    try:
        tweet = json.loads(jsondata)
        if not isinstance(tweet, dict):
            # Valid JSON that is not an object cannot be a tweet record.
            raise TypeError("tweet record is not a JSON object")
        if 'text' not in tweet:
            return
        createdAt = tweet['created_at']
        screenName = tweet['user']['screen_name']
        # Replace line breaks and tabs so every tweet stays on one output
        # line and the tab-separated layout is not broken.
        tweetText = tweet['text']
        for whitespace in ('\n', '\t', '\r'):
            tweetText = tweetText.replace(whitespace, ' ')
        output = '%s\t\t%s\t\t%s' % (createdAt, screenName, tweetText)
        # On Python 2 the formatted line can be a unicode object, which has
        # to be encoded before printing; a native str is printed as is.
        if not isinstance(output, str):
            output = output.encode('utf8')
    except (ValueError, KeyError, TypeError, AttributeError):
        # ValueError: undecodable JSON or text; KeyError: missing field;
        # TypeError/AttributeError: a field has an unexpected type.
        print("fail")
        return
    print(output)
def parseDir(inputDir):
    """Parse every tweet dump found anywhere below ``inputDir``.

    The tree is traversed with os.walk. Files whose name ends in
    ``.json.bz2`` are handed to parseFile_bz2, files ending in ``.json``
    to parseFile_json, and all other files are ignored. The counts
    returned by those parsers are summed and the total is printed once
    the walk has finished.
    """
    parsedTotal = 0
    for root, _dirs, names in os.walk(inputDir):
        for name in names:
            # Pick the parser from the file name; skip anything else.
            if name.endswith(".json.bz2"):
                handler = parseFile_bz2
            elif name.endswith(".json"):
                handler = parseFile_json
            else:
                continue
            parsedTotal += handler(os.path.join(root, name), name)
    print("Total of parsed tweets: %d" % parsedTotal)
def parseFile(inputFile):
    """Parse a single tweet dump, choosing the parser by file extension.

    ``.json`` files are handed to parseFile_json and ``.json.bz2`` files
    to parseFile_bz2; the path is also passed as the display name.

    A missing path (None), an unsupported extension, or a file that cannot
    be read or decoded prints an error message instead of raising.
    Returns None.
    """
    errorMessage = "Error: the file doesn`t exists or the type of the file is not json or bz2."
    # Reject a missing path or unsupported extension up front so the user
    # gets the message instead of a silent no-op.
    if inputFile is None or not inputFile.endswith((".json", ".json.bz2")):
        print(errorMessage)
        return
    try:
        if inputFile.endswith(".json.bz2"):
            parseFile_bz2(inputFile, inputFile)
        else:
            parseFile_json(inputFile, inputFile)
    except (IOError, OSError, EOFError, UnicodeError):
        # Missing/unreadable file, truncated archive, or non-UTF-8 content.
        print(errorMessage)
def parseFile_bz2(filepath, archive):
    """Parse a bzip2-compressed dump holding one JSON document per line.

    Every line is decoded as UTF-8 and passed to parseTweet. ``archive``
    is only used as the display name in the summary that is printed
    afterwards. Returns the number of lines read, which counts every
    line whether or not parseTweet printed it.
    """
    linesSeen = 0
    with bz2.BZ2File(filepath, 'r') as compressed:
        for rawLine in compressed:
            parseTweet(rawLine.decode('utf-8'))
            linesSeen += 1
    summary = "*%s: parsed | %d tweets*" % (archive, linesSeen)
    print(summary)
    print('-----------------------------------------------------------------------')
    return linesSeen
def parseFile_json(filepath, archive):
    """Parse an uncompressed dump holding one JSON document per line.

    Every line of ``filepath`` is passed to parseTweet. ``archive`` is
    only used as the display name in the summary that is printed
    afterwards. Returns the number of lines read, which counts every
    line whether or not parseTweet printed it.
    """
    count = 0
    # `with` guarantees the handle is closed even if parsing raises.
    with open(filepath) as fin:
        for line in fin:
            parseTweet(line)
            count += 1
    print("*%s: parsed | %d tweets*" % (archive, count))
    print('-----------------------------------------------------------------------')
    return count
if __name__ == '__main__':
    # Command-line entry point: parse a whole directory tree (-d) or a
    # single file (-f). When both are given, the directory takes precedence.
    parser = OptionParser()
    parser.add_option("-d", "--directory", dest="dir",
                      help="Directory containing all files", metavar="FOLDER")
    parser.add_option("-f", "--file", dest="file",
                      help="A single file", metavar="FILE")
    (options, args) = parser.parse_args()
    inputFile = options.file
    inputDir = options.dir
    if inputDir is not None:
        parseDir(inputDir)
    elif inputFile is not None:
        parseFile(inputFile)
    else:
        # Nothing to parse: print the usage text and exit with status 2
        # rather than reporting a misleading file error.
        parser.error("either -d FOLDER or -f FILE is required")