-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
83 lines (63 loc) · 2.61 KB
/
scraper.py
File metadata and controls
83 lines (63 loc) · 2.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from facebook_scraper import get_posts
from datetime import datetime
import locale
import re
import csv
# Switch the LC_TIME category to German so that strftime('%A') in scrap()
# produces German weekday names.
# NOTE(review): setlocale raises locale.Error when the "de_DE" locale is not
# available on the host - confirm it is installed where this runs.
locale.setlocale(locale.LC_TIME, "de_DE")
# Name of the Facebook page whose posts are read.
pagename = 'eismanufakturzeitgeist'
# Get the latest Facebook post by taking the first item of the generator
# returned by get_posts. This runs once, when the module is imported, and the
# result is shared by every later call to scrap().
# NOTE(review): presumably this performs a network request and raises
# StopIteration when the page yields no posts - verify against facebook_scraper.
post = next(get_posts(pagename, pages=1))
def scrap():
    """Extract the ice cream flavours per location from the module-level post.

    Reads ``post['time']``, ``post['text']`` and ``post['post_url']`` from the
    module-level ``post`` mapping.

    Returns:
        A tuple ``(date, post_text, raw_text, post_url)``:

        * ``date`` - the post time formatted with ``'%A, %d.%m.%Y'``.
        * ``post_text`` - for a flavour update, a list of flavour lists with
          the location headings removed (Lindenhof first, then
          Limburgerhof); otherwise the unmodified post text.
        * ``raw_text`` - the unmodified post text.
        * ``post_url`` - the URL of the post.

    Raises:
        IndexError: if a flavour update has fewer than three lines left after
            the closing sentence is removed, i.e. the expected
            heading/flavours/heading/flavours layout is not present.
    """
    # handle date of post
    # assumption: ice cream flavours are posted the same day
    posted_at = post['time']
    # TODO: if the post is from today skip repeated checks
    if posted_at.strftime('%Y-%m-%d') == datetime.today().strftime('%Y-%m-%d'):
        print("Post is from today")
    date = posted_at.strftime('%A, %d.%m.%Y')

    raw_text = post['text']
    post_url = post['post_url']
    post_text = raw_text

    # assumption: vanilla and chocolate are always on offer, so a post that
    # lacks either of them is not about ice cream flavours.
    # Each flavour needs its own membership test: the expression
    # `"Vanille" and "Schokolade" not in text` only ever checks "Schokolade".
    # Empty or missing text is treated as "no update" as well.
    if (not raw_text
            or "Vanille" not in raw_text
            or "Schokolade" not in raw_text):
        print("No ice cream update.")
        return date, post_text, raw_text, post_url

    # remove everything before Lindenhof and after it up until the ':'
    # TODO: This breaks if Limburgerhof is posted first
    lindenhof_heading = re.compile(r'.*?Lindenhof.*?:', re.DOTALL)
    post_text = lindenhof_heading.sub('Lindenhof', post_text)

    # split on linebreaks, dropping the spaces around them
    post_text = re.split('[ ]*\n+[ ]*', post_text.strip())

    # remove the closing sentence
    if "schmecken" in post_text[-1]:
        del post_text[-1]

    # reduce the Limburgerhof heading line to the bare location name
    limburgerhof_heading = re.compile(r'.*?Limburgerhof.*?:', re.DOTALL)
    for i, line in enumerate(post_text):
        if 'Limburgerhof' in line:
            post_text[i] = limburgerhof_heading.sub('Limburgerhof', line)

    # split every line on ',' and ' und '
    for i, line in enumerate(post_text):
        parts = re.split(' und |[,]', line)
        # flavour lines sit at the odd positions (1 and 3); only those get
        # their leading whitespace removed
        if i % 2 != 0:
            parts = [part.lstrip() for part in parts]
        post_text[i] = parts

    # remove the two location headings (original positions 0 and 2)
    post_text.pop(0)
    post_text.pop(1)

    return date, post_text, raw_text, post_url
def toCSV(date, post_text):
    """Write the flavours per location to ``../csv/<date>.csv``.

    Args:
        date: String used as the file name (without the ``.csv`` suffix).
        post_text: Sequence laid out as
            ``[heading_a, flavours_a, heading_b, flavours_b, ...]`` where
            every item is a list of strings. Items 0 and 2 form the header
            row; item 1 and every item from index 3 on become the columns.
            The sequence is not modified.

    Raises:
        IndexError: if ``post_text`` has fewer than three items.
        FileNotFoundError: if the ``../csv`` directory does not exist.
    """
    # Local import so the function does not depend on a new module-level import.
    from itertools import zip_longest

    # Build header and columns before opening the file, so malformed input
    # does not leave an empty CSV behind, and without popping from the
    # caller's list.
    header = [*post_text[0], *post_text[2]]
    columns = [post_text[1], *post_text[3:]]

    # utf-8 keeps German umlauts intact regardless of the platform default.
    with open('../csv/' + date + '.csv', 'w', newline='',
              encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow(header)
        # zip_longest pads the shorter column with empty cells; a plain zip
        # would silently drop the extra flavours of the longer one.
        writer.writerows(zip_longest(*columns, fillvalue=''))