CameraParser/camParser.py at master · vikrant1998/CameraParser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
--------------------------------------------------------------------------------
Descriptive Name     : camParser.py
Author               : Vikrant Satheesh Kumar
Contact Info         : vsathees@purdue.edu
Date Written         : 1/21/2017
Description          : Parse traffic-camera data from the NYC DOT website
Command to run script: python camParser.py
Usage                : N/A
Input file format    : N/A
Output               : nycdot_list ('#'-separated text file, written to the current directory)
Note                 :
Other files required by : This code requires PhantomJS, a headless web browser.
this script and where
located

----For Parsing Scripts---------------------------------------------------------
Website Parsed       : www.dotsignals.org
In database (Y/N)    : Y
Date added to Database : N/A
--------------------------------------------------------------------------------
"""
from bs4 import BeautifulSoup
import urllib2
import re
import sys
import time
import json
from selenium import webdriver
import platform

def nycdot():
    """Scrape the NYC DOT camera map and write the active cameras to a file.

    Downloads the marker JSON from dotsignals.org, loads each camera's popup
    page in a headless PhantomJS browser and takes the ``src`` of the first
    ``<img>`` tag as the snapshot URL. Cameras whose image path contains
    ``img/inactive`` are not written. A header line followed by one
    ``#``-separated line per remaining camera is written to ``nycdot_list``
    in the current working directory.

    Every snapshot URL that is found (active or not) is printed to stdout;
    cameras whose popup page has no usable image are reported on stderr and
    skipped.

    Returns:
        None.

    Raises:
        Network, JSON-decoding and WebDriver errors are not caught here and
        propagate to the caller. The browser and the output file are
        released in either case.
    """
    print("NYC dot")

    json_url = "http://dotsignals.org/new-data.php"  # JSON file containing the map data.
    camera_popup_url = "http://dotsignals.org/google_popup.php?cid="  # Per-camera popup page.

    if platform.system() == 'Windows':
        phantomjs_path = './phantomjs.exe'
    else:
        phantomjs_path = '/usr/local/bin/phantomjs'

    # Compiled once instead of on every loop iteration.
    inactive_pattern = re.compile(r'img/inactive')
    # Matches the leading run of word characters, '.', '/', ':' and
    # backslashes, so everything from the first other character onward
    # (for example a '?' query string) is dropped from the URL.
    url_pattern = re.compile(r'(?P<URL>[\w\.\/:\\]*)')

    # Fetch the marker list before starting the browser or touching the
    # output file, so a failed download leaves nothing behind.
    response = urllib2.urlopen(json_url)
    try:
        cameras = json.loads(response.read())['markers']
    finally:
        response.close()

    browser = webdriver.PhantomJS(phantomjs_path)
    try:
        with open('nycdot_list', 'w') as f:
            # Header names one column per field written below; the rows
            # carry country, state and city, so "state" is listed too.
            f.write("description#snapshot_url#latitude#longitude#country#state#city\n")

            for camera in cameras:
                cam_id = camera['id']
                browser.get(camera_popup_url + cam_id)
                # Name the parser explicitly so the result does not depend
                # on which optional parsers happen to be installed.
                soup = BeautifulSoup(browser.page_source, 'html.parser')

                image = soup.find('img')
                snapshot_url = image.get('src') if image is not None else None
                if not snapshot_url:
                    # No <img> tag or no src attribute: skip this camera
                    # rather than aborting the whole run.
                    sys.stderr.write("No snapshot image for camera %s\n" % cam_id)
                    continue

                # Only cameras that are not marked inactive are recorded.
                if inactive_pattern.search(snapshot_url) is None:
                    snapshot_url = url_pattern.search(snapshot_url).group('URL')
                    fields = [
                        camera['content'],
                        str(snapshot_url),
                        camera['latitude'],
                        camera['longitude'],
                        "USA",
                        "NY",
                        "New York",
                    ]
                    f.write("#".join(fields) + "\n")

                print(snapshot_url)
    finally:
        # Always shut the headless browser down so no PhantomJS process
        # is left running after the function exits.
        browser.quit()

if __name__ == "__main__":
    # Script entry point: announce start-up, then run the NYC DOT scraper.
    print("Getting started")
    nycdot()