Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
bef779d
Added downloadpdf.py
Sharan-Lobana Dec 3, 2015
32fd60d
Comparing IP instead of domain name
tarunz Dec 4, 2015
ef63552
Added clean version
tarunz Dec 4, 2015
3c04003
Merge branch 'master' of https://github.com/abhishekjiitr/pdf-miner
tarunz Dec 4, 2015
9068b82
Clean Scraping Code Added
abhishek-jaisingh Dec 4, 2015
63f320f
Added clean version
tarunz Dec 4, 2015
9ea37f8
File handling done in tester.py
abhishek-jaisingh Dec 5, 2015
b3371b1
link gathering almost done
abhishek-jaisingh Dec 5, 2015
9e649b9
5d
abhishek-jaisingh Dec 5, 2015
58a7482
WOrking now
tarunz Dec 6, 2015
6c4f982
Commit zaroori hai
tarunz Dec 6, 2015
e7319e9
Added initial files for name extraction
abhishek-jaisingh Dec 6, 2015
899544e
Saved websites in txt file
abhishek-jaisingh Dec 6, 2015
761c023
fixed invalid response with try catch
abhishek-jaisingh Dec 6, 2015
8114c8b
Added Stanford NER
abhishek-jaisingh Dec 9, 2015
cdc5739
Recursive pdf to xml convertor added
abhishek-jaisingh Dec 11, 2015
26eef41
Merge branch 'master' of https://github.com/abhishekjiitr/pdf-miner
abhishek-jaisingh Dec 11, 2015
2f5affc
more python files added
abhishek-jaisingh Dec 11, 2015
cf5b97f
Redefined Code
abhishek-jaisingh Aug 6, 2016
5c8f0de
More changes
abhishek-jaisingh Aug 7, 2016
b796866
Shippable Software Ready
abhishek-jaisingh Aug 7, 2016
db719da
Create README.md
abhishek-jaisingh Sep 17, 2016
99778c8
Update README.md
abhishek-jaisingh Sep 25, 2016
1b5dde9
Updated Readme
abhishek-jaisingh Sep 25, 2016
0b66929
Update README.md
abhishek-jaisingh Oct 16, 2016
6e6b3dc
Update README.md
abhishek-jaisingh Oct 29, 2016
9983e60
Merge pull request #2 from abhishekjiitr/working
abhishek-jaisingh Oct 29, 2016
b78e19e
Added regex explanation
Sharan-Lobana Oct 30, 2017
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 160 additions & 0 deletions EmailToNameMapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# from nltk import metrics

'''
Function : editDistDP

Utility function that computes the editdistance
(Levenshtein distance) between two strings
The Levenshtein distance is a metric to measure difference
between two sequences of characters

Parameters :
str1 - The first string
str2 - The second string
m - length of first string
n - length of second string

Returns :
The editdistance between two strings of length m and n respectively

'''

def editDistDP(str1, str2, m, n):
    '''
    Compute the Levenshtein (edit) distance between the first m characters
    of str1 and the first n characters of str2 with a bottom-up table.

    Parameters :
        str1 - The first string
        str2 - The second string
        m    - number of leading characters of str1 to compare
        n    - number of leading characters of str2 to compare

    Returns :
        The minimum number of single-character insertions, removals and
        replacements that turn str1[:m] into str2[:n]
    '''
    # table[row][col] holds the distance between str1[:row] and str2[:col]
    table = [[0] * (n + 1) for _ in range(m + 1)]

    # Turning a prefix of str1 into the empty string removes every character
    for row in range(m + 1):
        table[row][0] = row

    # Turning the empty string into a prefix of str2 inserts every character
    for col in range(n + 1):
        table[0][col] = col

    for row in range(1, m + 1):
        for col in range(1, n + 1):
            if str1[row - 1] == str2[col - 1]:
                # Same trailing character: it costs nothing, reuse the
                # result for both prefixes shortened by one
                table[row][col] = table[row - 1][col - 1]
                continue

            insert_cost = table[row][col - 1]
            remove_cost = table[row - 1][col]
            replace_cost = table[row - 1][col - 1]
            table[row][col] = 1 + min(insert_cost, remove_cost, replace_cost)

    return table[m][n]

'''
Function :edit_distance

Delegates the computation of Levenshtein distance to
the function editDistDP

'''
def edit_distance(str1, str2):
    '''
    Return the Levenshtein distance between the complete strings str1 and
    str2 by handing both strings and their lengths to editDistDP.
    '''
    first_length = len(str1)
    second_length = len(str2)
    return editDistDP(str1, str2, first_length, second_length)

'''
Function : emailToNameMapping

For each email in the emailList:
For each name in the nameList:
Compute the editdistance of email and name
if the editdistance is minimum for selected email and name pair:
given name is suitable candidate for selected email

Parameters :
emailList - List of emails extracted from a given xml file
nameList - List of possible names extracted from the same xml file

Returns :
A dictionary with emails as keys and the corresponding mapped names as values

'''
def emailToNameMapping(emailList, nameList):
    '''
    Map every email to the name whose spelling is closest to its local part.

    The part of the email before '@' is stripped of digits and lower-cased,
    then compared (Levenshtein distance, via edit_distance) against every
    spelling produced by _name_variants for every name. The name owning the
    closest spelling wins; on a tie the name that appears first in nameList
    is kept.

    Parameters :
        emailList - List of emails extracted from a given xml file
        nameList  - List of possible names extracted from the same xml file

    Returns :
        A dictionary with each email as key and its best matching name as
        value. The value is '' when nameList is empty.
    '''
    finalEmailNameMapping = dict()

    for email in emailList:
        # Digits (roll numbers, years, ...) never occur in a name, so they
        # would only inflate the distance
        local_part = email.split('@')[0]
        target = ''.join(ch for ch in local_part if not ch.isdigit()).lower()

        # Reset for every email so that an email without any candidate can
        # never inherit the match of the previous email
        best_name = ''
        best_distance = float('inf')

        for candidate in nameList:
            for variant in _name_variants(candidate):
                distance = edit_distance(target, variant)
                if distance < best_distance:
                    best_distance = distance
                    best_name = candidate

        finalEmailNameMapping[email] = best_name

    # Return the dictionary of email and name mappings
    return finalEmailNameMapping


def _name_variants(name):
    '''
    List the lower-case spellings of name that an email local part may use.

    Single-word name (eg: Rahul):
        only the name itself.
    Multi-word name (eg: Sharanpreet Singh Lobana), dots removed:
        - every component longer than two characters (skips eg: Dr, P)
        - all components glued together (sharanpreetsinghlobana)
        - for 2 to 4 components, every spelling in which at least one
          component is shortened to its initial (eg: sslobana, ssinghl)
    '''
    parts = [word.replace('.', '') for word in name.split()]
    if len(parts) <= 1:
        return [name.lower()]

    variants = [part.lower() for part in parts if len(part) > 2]
    variants.append(name.replace(' ', '').replace('.', '').lower())

    count = len(parts)
    # The number of initial/full combinations doubles with every component,
    # so they are only generated for names with at most four components
    if count <= 4:
        # Each bit of mask decides whether one component is reduced to its
        # initial; mask 0 (nothing shortened) is the glued name added above
        for mask in range(1, 1 << count):
            pieces = []
            for position, part in enumerate(parts):
                shortened = (mask >> (count - 1 - position)) & 1
                if shortened:
                    # part[:1] is '' for a component that was only dots,
                    # so no exception handling is needed
                    pieces.append(part[:1].lower())
                else:
                    pieces.append(part.lower())
            variants.append(''.join(pieces))

    return variants
22 changes: 0 additions & 22 deletions LICENSE

This file was deleted.

75 changes: 75 additions & 0 deletions NameExtractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize,sent_tokenize
from PyPDF2 import PdfFileWriter,PdfFileReader
import re
import enchant

'''
Function: get_emails
removes emails from input text

Returns:
b - list of emails
modified_s - text without emails

Parameters:
s - string : reduced text with statements merged with ';'

'''

# regex is the regular expression for finding out emails
# out of streams of characters separated by whitespaces
regex = re.compile("([a-zA-Z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-zA-Z0-9!#$%&'*+\/=?^_`"
"{|}~-]+)*(@|\sat\s)(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(\.|"
"\sdot\s))+[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)")
def get_emails(s):
a=re.findall(regex,s)
b=list()
for a1 in a:
b.append(a1[0])
modified_s = re.sub("[^;]*?([a-zA-Z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-zA-Z0-9!#$%&'*+\/=?^_`""{|}~-]+)*(@|\sat\s)(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(\.|""\sdot\s))+[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?).*?;",' ',s)
return b,modified_s;

'''
Function: textToNames
extracts possible names from processed input text

Returns:
list - list of possible names

Parameters:
textIn - string : reduced text with statements merged with ';'

'''

def textToNames(textIn):
    '''
    Extract the possible names from processed input text.

    Statements containing emails are dropped first (see get_emails); the
    remaining text is cut at every ';' and ',' and each non-blank piece is
    kept as a candidate name.

    Parameters :
        textIn - string : reduced text with statements merged with ';'

    Returns :
        list of unique candidate names, in order of first appearance, with
        newlines turned into spaces and surrounding whitespace removed
    '''
    _, modifiedText = get_emails(textIn)

    candidates = []
    # Iterate over the split result without mutating it: removing blank
    # pieces from the list during iteration would make the loop skip the
    # piece that follows each blank one
    for piece in modifiedText.replace(',', ';').split(';'):
        cleaned = piece.replace('\n', ' ').strip()
        if cleaned:
            candidates.append(cleaned)

    # dict.fromkeys removes duplicates while keeping a stable order
    return list(dict.fromkeys(candidates))
39 changes: 38 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,39 @@
# pdf-miner
python based crawler to mine pdfs from websites
Python-based crawler that mines PDFs from research-related websites and extracts useful features such as author names and emails.

## Requirements:

For python3

1. pypdf2
2. pymysql
3. nltk (tokenizer)
4. pyenchant (imported as `enchant`)
5. BeautifulSoup
6. urllib
7. requests
8. pdftohtml

## Database Settings:

Make a database on localhost named "authors_db".
In "authors_db" create a table named "nameemail" having the following fields:

email varchar length: 500 (make it unique to avoid duplication)
name varchar length: 500
info varchar length: 500
website varchar length: 500

```
default credentials
user: root
password: admin123
```
## Usage:

Place domains list in finalDomain.txt
Start automate.py

```
python automate.py
```
22 changes: 22 additions & 0 deletions Requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
For python3
1. pypdf2
2. pymysql
3. nltk (tokenizer)
4. enchant (otherwise known as pyenchant)
5. BeautifulSoup
6. urllib
7. requests
8. pdftohtml
9. shutil (python3 module)

Database Settings:
Make a database on localhost named "authors_db".
In "authors_db" create a table named "nameemail" having the following fields:

email varchar length: 500 (make it unique to avoid duplication)
name varchar length: 500
info varchar length: 500
website varchar length: 500

user: root
password: admin123
32 changes: 32 additions & 0 deletions automata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os,shutil
from configuration import PDF_DOWNLOAD_DIRECTORY, PDF_FILES_BACKUP_DIRECTORY
'''
Function : moving

Moves the content of one directory to another

Parameters :
src - The source directory(Default values are given for the used linux system)
des - The destination directory (Default values are given for use linux system)

Returns :
Void
'''
def moving(src=PDF_DOWNLOAD_DIRECTORY,des=PDF_FILES_BACKUP_DIRECTORY):
# Walks src with os.walk and tries to shutil.move() every directory it
# reports into des; an OSError is printed instead of raised.
# Nothing is returned.
# NOTE(review): this copy of the file has lost its indentation, so the
# nesting of the second try-block (inside the except handler vs. after the
# loops) cannot be determined from here -- confirm against the repository
# before restructuring this function.
i=0 # NOTE(review): never read in the visible code
for root, dirnames, filenames in os.walk(src):
for direc in dirnames:
try:
#Utility module to cut and paste files from one directory to another
shutil.move(os.path.join(root,direc), des)

except OSError as e:
print(str(e))
#If error occurs due to clash of names,
# remove whatever is still left below src
try:
pass
# NOTE(review): src is concatenated into a shell command line, so a path
# with spaces or shell metacharacters changes what gets executed; the
# exit status returned by os.system is also ignored -- consider
# shutil.rmtree / os.remove instead.
os.system("rm -r " + src + "/*")
# print(src)
except OSError as e:
print(str(e))

Loading