Collaborative_knowledge_engineering_Wikidata/process_Wikidata_dumps.py at main · ElisavetK/Collaborative_knowledge_engineering_Wikidata · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# This script processes the XML dumps. We use threading to read the files in parallel. The function extraction2 reads an XML dump and
# extracts the item talk pages and property talk pages (namespaces 1 and 121 respectively). The script creates one CSV file per dump file.

import xml.etree.ElementTree as etree
from bz2 import BZ2File
import pandas as pd
import json
import codecs
import csv
import os
import re
import logging
import threading
import concurrent.futures
import tracemalloc


def extraction2(file_name, path_read=None, path_save=None):
    """Write the newest revision text of every talk page in one dump to a CSV.

    The bz2-compressed XML export at ``path_read + file_name`` is streamed
    with ``iterparse``. For every page whose namespace is '1' or '121', one
    row ``[page_title, namespace, text]`` is written to
    ``path_save + file_name[:-4] + '_DiscussionPage.csv'``, where ``text`` is
    the text of the revision with the greatest timestamp (the first one seen
    wins on a tie). Pages in those namespaces that contain no revision text
    element produce no row.

    Args:
        file_name: Name of the dump file; its last four characters (e.g.
            '.bz2') are dropped to build the CSV name.
        path_read: Prefix prepended verbatim to ``file_name`` to locate the
            dump. Defaults to the module-level ``PATH_read``.
        path_save: Prefix prepended verbatim to the CSV name. Defaults to the
            module-level ``PATH_save``.

    Returns:
        None. The result is the CSV file written to disk.

    Raises:
        OSError: If the dump cannot be opened or the CSV cannot be created.
        xml.etree.ElementTree.ParseError: If the dump is not well-formed XML.
    """
    if path_read is None:
        path_read = PATH_read
    if path_save is None:
        path_save = PATH_save

    prefix = '{http://www.mediawiki.org/xml/export-0.10/}'
    p_title = prefix + 'title'
    p_ns = prefix + 'ns'
    p_timestamp = prefix + 'timestamp'
    p_page = prefix + 'page'
    p_revision = prefix + 'revision'
    p_text = prefix + 'text'

    # Namespaces of the discussion pages that are exported.
    discussion_namespaces = ('1', '121')

    source = str(path_read) + str(file_name)
    target = str(path_save) + str(file_name[:-4]) + '_DiscussionPage.csv'

    # Both files are managed by the with-statement so the CSV is flushed and
    # closed even when parsing fails. newline='' lets the csv module control
    # the line endings itself.
    with BZ2File(source) as xml_file, \
            open(target, 'w', encoding='utf-8', newline='') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(['page_title', 'namespace', 'text'])

        # Displaying the memory; this reports (0, 0) unless the caller has
        # started tracemalloc.
        print(tracemalloc.get_traced_memory())

        root = None           # document root, emptied after every page
        path = []             # stack of the tags of all currently open elements
        c_title = None
        c_ns = None
        c_timestamp = None
        latest = None         # (timestamp, text) of the newest revision of the page

        for event, elem in etree.iterparse(xml_file, events=('start', 'end')):
            if event == 'start':
                if root is None:
                    root = elem
                path.append(elem.tag)
                continue

            tag = elem.tag
            if tag == p_title and p_page in path:  # store the page title
                c_title = elem.text
            elif tag == p_ns and p_page in path:  # store the page namespace
                c_ns = elem.text
            elif tag == p_timestamp:  # revision timestamp, closes before the text
                c_timestamp = elem.text
            elif tag == p_text:
                if c_ns in discussion_namespaces:
                    # Assumes timestamps are fixed-format ISO 8601 strings, so
                    # string order equals chronological order.
                    stamp = c_timestamp or ''
                    # Strict '>' keeps the first revision seen on a tie.
                    if latest is None or stamp > latest[0]:
                        latest = (stamp, elem.text)
            elif tag == p_revision:
                # Everything needed was copied out already; drop the subtree
                # so pages with many revisions do not accumulate in memory.
                elem.clear()
            elif tag == p_page:  # end of a page: emit its newest revision text
                if c_ns in discussion_namespaces and latest is not None:
                    writer.writerow([c_title, c_ns, latest[1]])
                # Reset per-page state so nothing leaks into the next page.
                c_title = None
                c_ns = None
                c_timestamp = None
                latest = None
                # Detach finished pages from the root; otherwise iterparse
                # keeps the whole dump in memory.
                root.clear()

            path.pop()


#===========================
#threading process


# Text encoding name; not referenced in the visible part of this file.
ENCODING = "utf-8"

# Directory prefixes. They are concatenated directly with file names, so each
# one must end with a path separator.
PATH_read = 'Dumps/'
PATH_save = 'Discussion_files/'
# Half-open range [start_file, end_file) of dump indices to process.
start_file = 0
end_file = 1


# exist_ok=True replaces the check-then-create sequence, which could fail if
# another process created the directory in between; makedirs also creates any
# missing parent directories.
os.makedirs(PATH_save, exist_ok=True)


def thread_function(name):
    """Run extraction2 on one dump file, logging start, finish and failure.

    Args:
        name: Dump file name, passed unchanged to extraction2.

    Raises:
        Exception: Anything raised by extraction2 is logged with its
            traceback and then re-raised to the caller.
    """
    logging.info("Thread %s: starting", name)
    try:
        extraction2(name)
    except Exception:
        # Inside an executor the exception is only stored on the future, so
        # without this log a failed dump would go unnoticed unless the caller
        # inspects the result.
        logging.exception("Thread %s: failed", name)
        raise
    logging.info("Thread %s: finishing", name)


if __name__ == "__main__":
    # Script entry point: process dump_<i>.bz2 for i in
    # [start_file, end_file) on a pool of worker threads.
    log_format = "%(asctime)s: %(message)s"  # renamed: 'format' shadows the builtin
    logging.basicConfig(format=log_format, level=logging.INFO,
                        datefmt="%H:%M:%S")

    dump_names = [f'dump_{i}.bz2' for i in range(start_file, end_file)]
    failed = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
        # Keep the futures so that worker exceptions can be inspected; the
        # iterator returned by executor.map only raises them when consumed.
        pending = {executor.submit(thread_function, dump_name): dump_name
                   for dump_name in dump_names}
        for future in concurrent.futures.as_completed(pending):
            error = future.exception()
            if error is not None:
                failed.append(pending[future])
                logging.error("Thread %s: failed with %r", pending[future], error)

    if failed:
        # Report the failure through the exit status after all dumps were tried.
        raise SystemExit(1)