Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions Dockerfile

This file was deleted.

5 changes: 2 additions & 3 deletions benchmark.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
#! /usr/bin/env python
#! /usr/bin/env python3
import scraperwiki
import os
from six.moves import range

rows = [{'id': i, 'test': i * 2, 's': "abc"} for i in range(1000)]

try:
os.remove('scraperwiki.sqlite')
except OSError:
except FileNotFoundError:
pass

scraperwiki.sql.save(['id'], rows)
Expand Down
8 changes: 3 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ authors = [
{ name = "ScraperWiki Developers", email = "hello@scraperwiki.com" }
]
license = { text = "GPLv3+" }
readme = "README.md" # Assumes you have a README file
readme = "README.rst"
urls = { Repository = "https://github.com/cantabular/scraperwiki-python" }
classifiers = [
"Intended Audience :: Developers",
Expand All @@ -25,10 +25,8 @@ classifiers = [
"Topic :: Database :: Front-Ends",
]
dependencies = [
"requests==2.32.5",
"six==1.17.0",
"sqlalchemy==2.0.45",
"alembic==1.17.2",
"sqlalchemy>=2,<3",
"alembic",
]

[tool.setuptools.packages.find]
Expand Down
3 changes: 1 addition & 2 deletions save_speedtest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#! /usr/bin/env python
#! /usr/bin/env python3
import scraperwiki
from six.moves import range

rows = [{'id': i, 'test': i * 2, 's': "xx"*i} for i in range(10)]

Expand Down
3 changes: 0 additions & 3 deletions scraperwiki/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
#!/usr/bin/env python
# Thomas Levine, ScraperWiki Limited

'''
Local version of ScraperWiki Utils, documentation here:
https://scraperwiki.com/docs/python/python_help_documentation/
Expand Down
21 changes: 5 additions & 16 deletions scraperwiki/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,23 @@

import alembic.ddl
import sqlalchemy
import six

DATABASE_NAME = os.environ.get("SCRAPERWIKI_DATABASE_NAME",
"sqlite:///scraperwiki.sqlite")

DATABASE_TIMEOUT = float(os.environ.get("SCRAPERWIKI_DATABASE_TIMEOUT", 300))
SECONDS_BETWEEN_COMMIT = 2
unicode = str

# The scraperwiki.sqlite.SqliteError exception
SqliteError = sqlalchemy.exc.SQLAlchemyError

class Blob(bytes):
    """
    Marker type for binary values.

    Behaves exactly like ``bytes``; the distinct type exists so that
    ``PYTHON_SQLITE_TYPE_MAP`` can map it to ``LargeBinary`` explicitly.
    """

PYTHON_SQLITE_TYPE_MAP = {
str: sqlalchemy.types.Text,
str: sqlalchemy.types.Text,
int: sqlalchemy.types.BigInteger,
bool: sqlalchemy.types.Boolean,
Expand All @@ -37,19 +35,10 @@ class Blob(bytes):
datetime.date: sqlalchemy.types.Date,
datetime.datetime: sqlalchemy.types.DateTime,

bytes: sqlalchemy.types.LargeBinary,
Blob: sqlalchemy.types.LargeBinary,
}

if bytes is not str:
# On 2.7, bytes *is* str, so we don't want to overwrite that.
PYTHON_SQLITE_TYPE_MAP[bytes] = sqlalchemy.types.LargeBinary

try:
PYTHON_SQLITE_TYPE_MAP[long] = sqlalchemy.types.BigInteger
except NameError:
pass


class _State:

"""
Expand Down Expand Up @@ -183,7 +172,7 @@ def select(query, data=None):

rows = []
for row in result:
rows.append(dict(list(row._mapping.items())))
rows.append(dict(row._mapping))

return rows

Expand Down Expand Up @@ -268,7 +257,7 @@ def save_var(name, value):
if column_type == sqlalchemy.types.LargeBinary:
value_blob = value
else:
value_blob = unicode(value).encode('utf-8')
value_blob = str(value).encode('utf-8')

values = dict(name=name,
value_blob=value_blob,
Expand Down
57 changes: 23 additions & 34 deletions scraperwiki/utils.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,53 @@
#!/usr/bin/env python
# utils.py
# David Jones, ScraperWiki Limited
# Thomas Levine, ScraperWiki Limited

'''
Local version of ScraperWiki Utils, documentation here:
https://scraperwiki.com/docs/python/python_help_documentation/
'''
import os
import shutil
import sys
import warnings
import tempfile
import six.moves.urllib.parse
import six.moves.urllib.request
import requests
import urllib.parse
import urllib.request


def scrape(url, params=None, user_agent=None):
    '''
    Scrape a URL optionally with parameters.
    This is effectively a wrapper around urllib.request.urlopen.

    url        -- the URL to fetch.
    params     -- optional mapping (or sequence of pairs) that is
                  URL-encoded and sent as the request body.
    user_agent -- optional value for the User-Agent request header.

    Returns the raw response body as bytes. Any exception raised by
    urllib.request.urlopen propagates to the caller.
    '''
    headers = {}
    if user_agent:
        headers['User-Agent'] = user_agent

    # urllib.request requires the request body to be bytes, so the
    # URL-encoded form data is encoded explicitly.
    data = None
    if params:
        data = urllib.parse.urlencode(params).encode('utf-8')

    req = urllib.request.Request(url, data=data, headers=headers)

    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(req) as f:
        text = f.read()

    return text


def pdftoxml(pdfdata, options=""):
"""converts pdf file to xml file"""
if not shutil.which('pdftohtml'):
warnings.warn(
'scraperwiki.pdftoxml requires pdftohtml, but pdftohtml was not found '
'in the PATH. If you wish to use this function, you probably need to '
'install pdftohtml.'
)
return None
pdffout = tempfile.NamedTemporaryFile(suffix='.pdf')
pdffout.write(pdfdata)
pdffout.flush()

xmlin = tempfile.NamedTemporaryFile(mode='r', suffix='.xml')
xmlin = tempfile.NamedTemporaryFile(mode='r', suffix='.xml', encoding="utf-8")
tmpxml = xmlin.name # "temph.xml"
cmd = 'pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes {} "{}" "{}"'.format(
options, pdffout.name, os.path.splitext(tmpxml)[0])
Expand All @@ -55,29 +59,14 @@ def pdftoxml(pdfdata, options=""):
#xmlfin = open(tmpxml)
xmldata = xmlin.read()
xmlin.close()
return xmldata.decode('utf-8')


def _in_box():
return os.environ.get('HOME', None) == '/home'
return xmldata


def status(type, message=None):
    """
    Retained for backwards compatibility.

    Both arguments are accepted and ignored. Every call emits a
    DeprecationWarning attributed to the caller (stacklevel=2) and
    returns None.
    """
    warnings.warn(
        "status() is no longer in use following ScraperWiki/Quickcode "
        "application shutdown",
        DeprecationWarning,
        stacklevel=2,
    )
    return None

def swimport(scrapername):
    """
    Import the module called `scrapername` and return it.

    This is a thin wrapper around the built-in __import__, so for a
    dotted name such as "pkg.mod" the top-level package ("pkg") is
    returned, not the submodule. ImportError propagates when the
    module cannot be found.
    """
    return __import__(scrapername)
27 changes: 10 additions & 17 deletions tests.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#!/usr/bin/env python
import datetime
import json
import os
Expand All @@ -13,7 +12,6 @@
from unittest import TestCase, main

import scraperwiki
import six

import sys
# scraperwiki.sql._State.echo = True
Expand Down Expand Up @@ -64,9 +62,9 @@ def test_date(self):
date1 = datetime.datetime.now()
date2 = datetime.date.today()
scraperwiki.sql.save_var(u"weird\u1234", date1)
self.assertEqual(scraperwiki.sql.get_var(u"weird\u1234"), six.text_type(date1))
self.assertEqual(scraperwiki.sql.get_var(u"weird\u1234"), str(date1))
scraperwiki.sql.save_var(u"weird\u1234", date2)
self.assertEqual(scraperwiki.sql.get_var(u"weird\u1234"), six.text_type(date2))
self.assertEqual(scraperwiki.sql.get_var(u"weird\u1234"), str(date2))

def test_save_multiple_values(self):
scraperwiki.sql.save_var(u'foo\xc3', u'hello')
Expand Down Expand Up @@ -94,7 +92,7 @@ def test_insert(self):
""")
((colname, value, _type),) = self.cursor.fetchall()
expected = [(u"birthday\xfe", u"\u1234November 30, 1888", "text",)]
observed = [(colname, type(b'')(value).decode('utf-8'), _type)]
observed = [(colname, value.decode('utf-8'), _type)]
self.assertEqual(observed, expected)

class SaveAndCheck(TestCase):
Expand Down Expand Up @@ -269,7 +267,7 @@ def test_lxml_string(self):
self.save_and_check(
{"text": s},
"lxml",
[(six.text_type(s),)]
[(str(s),)]
)

def test_save_and_drop(self):
Expand Down Expand Up @@ -320,7 +318,7 @@ def test_save_date(self):
scraperwiki.sql.select("* FROM swdata"))

self.assertEqual(
{u'keys': [u'birthday\xaa'], u'data': [(six.text_type(d),)]},
{u'keys': [u'birthday\xaa'], u'data': [(str(d),)]},
scraperwiki.sql.execute("SELECT * FROM swdata"))

self.assertEqual(str(d), self.rawdate(column=u"birthday\xaa"))
Expand All @@ -331,7 +329,7 @@ def test_save_datetime(self):
scraperwiki.sql.save([], {"birthday": d},
table_name="datetimetest")

exemplar = six.text_type(d)
exemplar = str(d)
# SQLAlchemy appears to convert with extended precision.
exemplar += ".000000"

Expand All @@ -347,13 +345,11 @@ def test_save_datetime(self):
class TestStatus(TestCase):
    'Test that the deprecated status() call still returns None.'

    def test_status(self):
        # status() emits a DeprecationWarning on every call; silence it
        # here so the test output stays clean.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)

            self.assertEqual(scraperwiki.status('ok'), None)

class TestUnicodeColumns(TestCase):
maxDiff = None
Expand Down Expand Up @@ -385,6 +381,3 @@ def test_import_scraperwiki_utils(self):

def test_import_scraperwiki_special_utils(self):
self.sw.pdftoxml

if __name__ == '__main__':
main()
Loading