# word2vec_study/datapreprocessing.py (master branch of chmodsss/word2vec_study on GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import nltk
import codecs
import collections
import hashlib
from nltk import sent_tokenize
from string import punctuation
import multiprocessing as mp
from multiprocessing import pool
from itertools import izip_longest

# Module-level configuration shared by the functions in this file.
# (A `global` statement has no effect at module scope, so none is used here.)

# Prefix that removeSpecialCharsWorker checks with str.startswith to drop tokens.
del_words = '@'
# Punctuation characters to delete from text; '#' and '\' are deliberately kept.
punct = punctuation.replace('#', '').replace('\\', '')
# German stopword list, loaded from the NLTK stopwords corpus at import time.
default_stopwords = nltk.corpus.stopwords.words('german')
# Number of worker processes requested for each multiprocessing pool.
cores = 48
# Number of lines handed to Pool.map per batch.
block = 10000
# Swear words to filter on; reassigned by removeObscenity() from its swear file.
swear_words = []

def grouper(n, iterable, padvalue = None):
    """Split ``iterable`` into consecutive tuples of length ``n``.

    The final tuple is padded with ``padvalue`` when the number of items is
    not a multiple of ``n``.

    Args:
        n: Size of each group.
        iterable: Any iterable to be chunked.
        padvalue: Filler used to complete the last group.

    Returns:
        A lazy iterator of ``n``-tuples.
    """
    # The helper was renamed between Python 2 and 3; resolve it locally so
    # this function works on either interpreter.
    try:
        from itertools import zip_longest
    except ImportError:
        from itertools import izip_longest as zip_longest
    # n references to the SAME iterator make zip pull n consecutive items
    # into each tuple.
    return zip_longest(*[iter(iterable)] * n, fillvalue=padvalue)


def removeSpecialCharsWorker(line):
    """Clean one line of text for training; return None if nothing is left.

    Lines containing 'http' are discarded entirely. Otherwise tokens that
    start with ``del_words`` are dropped, the characters in ``punct`` are
    deleted, and purely numeric tokens and stopwords are removed. The
    surviving tokens are lower-cased and joined with single spaces.

    Args:
        line: A text line (UTF-8 bytes are decoded), or None/empty for the
            padding entries produced by ``grouper``.

    Returns:
        The cleaned, lower-cased text, or None when the line is empty,
        contains a URL, or has no tokens left after filtering.
    """
    if not line:
        return None
    # Work on text rather than encoded bytes: byte strings cannot be written
    # to the UTF-8 codec writer, compared with the stopword list or
    # lower-cased correctly once they contain non-ASCII characters.
    if isinstance(line, bytes):
        line = line.decode('utf-8')
    line = line.rstrip()
    if 'http' in line:
        return None
    strip_table = dict.fromkeys(map(ord, punct))
    kept = []
    for token in line.split():
        # Test the prefix BEFORE deleting punctuation: '@' is itself part of
        # punct, so checking afterwards could never match.
        if token.startswith(del_words):
            continue
        # Lower-case before the stopword lookup so capitalised stopwords
        # (e.g. at the start of a sentence) are removed as well.
        word = token.translate(strip_table).lower()
        if word and not word.isdigit() and word not in default_stopwords:
            kept.append(word)
    if kept:
        return ' '.join(kept)
    return None


def removeSpecialChars(target_file):
    """Clean every line of ``target_file`` in parallel and write the result.

    The whole input is read into memory, processed in batches of ``block``
    lines by a pool of ``cores`` workers running
    ``removeSpecialCharsWorker``, and every non-empty result is written on
    its own line to ``<base>_c<ext>`` next to the input.

    Args:
        target_file: Path of the UTF-8 encoded input text file.

    Returns:
        Path of the cleaned output file.
    """
    with codecs.open(target_file, 'r', 'utf-8') as infile:
        read_data = infile.read().splitlines()
    (base, ext) = os.path.splitext(target_file)
    # Build the name from the split base: str.rstrip(ext) strips a SET of
    # characters, not a suffix, and would turn 'text.txt' into 'te'.
    out_path = base + '_c' + ext
    p = mp.Pool(cores)
    try:
        with codecs.open(out_path, 'w', 'utf-8') as write_file:
            for chunk in grouper(block, read_data):
                for r in p.map(removeSpecialCharsWorker, chunk):
                    if r:
                        write_file.write(r)
                        write_file.write('\n')
    finally:
        # Release the worker processes even if a batch fails.
        p.close()
        p.join()
    return out_path


def removeObscenityWorker(line):
    """Return the right-stripped line, or None if it must be discarded.

    A line is discarded when it is empty/None or when any of its
    whitespace-separated tokens appears in the module-level ``swear_words``.
    """
    if not line:
        return None
    stripped = line.rstrip()
    for token in stripped.split():
        if token in swear_words:
            return None
    return stripped

def _setSwearWords(words):
    """Pool initializer: install the swear-word collection in a worker process."""
    global swear_words
    swear_words = words


def removeObscenity(target_file, swear_file):
    """Drop every line of ``target_file`` that contains a swear word.

    Swear words are read one per line from ``swear_file``; each word is
    matched both as-is and with a leading '#'. Lines are filtered in batches
    of ``block`` by a pool of ``cores`` workers running
    ``removeObscenityWorker``, and the surviving lines are written to
    ``<base>_o<ext>`` next to the input.

    Side effect: the module-level ``swear_words`` list is replaced.

    Args:
        target_file: Path of the UTF-8 encoded input text file.
        swear_file: Path of the UTF-8 encoded swear-word list.

    Returns:
        Path of the filtered output file.
    """
    global swear_words
    with codecs.open(target_file, 'r', 'utf-8') as infile:
        read_data = infile.read().splitlines()
    with codecs.open(swear_file, 'r', 'utf-8') as infile:
        raw_swear_words = infile.read().splitlines()
    tag_swear_words = ['#' + word for word in raw_swear_words]
    swear_words = raw_swear_words + tag_swear_words
    (base, ext) = os.path.splitext(target_file)
    # Build the name from the split base: str.rstrip(ext) strips a SET of
    # characters, not a suffix, and would turn 'text.txt' into 'te'.
    out_path = base + '_o' + ext
    # Hand the words to every worker explicitly. Relying on the global alone
    # only works when workers are forked; spawned workers re-import the
    # module and would see the empty default list. A frozenset also makes
    # the per-token membership test constant-time.
    p = mp.Pool(cores, initializer=_setSwearWords,
                initargs=(frozenset(swear_words),))
    try:
        with codecs.open(out_path, 'w', 'utf-8') as write_file:
            for chunk in grouper(block, read_data):
                for r in p.map(removeObscenityWorker, chunk):
                    if r:
                        write_file.write(r)
                        write_file.write('\n')
    finally:
        # Release the worker processes even if a batch fails.
        p.close()
        p.join()
    return out_path


class MySentences(object):
    """Re-iterable view over the lines of a UTF-8 encoded text file.

    Each iteration opens the file afresh, so an instance can be iterated
    several times. Lines are yielded as read, including their line ending.
    """

    def __init__(self, file_name):
        """Remember the path of the file to iterate over."""
        self.file_name = file_name

    def __iter__(self):
        """Yield the file's lines one at a time."""
        # The with-block closes the handle once the generator is exhausted
        # or discarded instead of leaving it open.
        with codecs.open(self.file_name, 'r', 'utf-8') as infile:
            for line in infile:
                yield line


def removeDuplicates(target_file):
    """Write a copy of ``target_file`` with repeated lines removed.

    Lines are compared after stripping trailing whitespace; the first
    occurrence of each line is kept, in input order. The result is written
    to ``<base>_d<ext>`` next to the input.

    Args:
        target_file: Path of the UTF-8 encoded input text file.

    Returns:
        Path of the de-duplicated output file.
    """
    (base, ext) = os.path.splitext(target_file)
    # Build the name from the split base: str.rstrip(ext) strips a SET of
    # characters, not a suffix, and would turn 'text.txt' into 'te'.
    out_path = base + '_d' + ext
    # Remember a fixed-size digest per distinct line rather than the line
    # itself; a set gives constant-time duplicate checks.
    seen = set()
    with codecs.open(target_file, 'r', 'utf-8') as infile, \
            codecs.open(out_path, 'w', 'utf-8') as write_file:
        for line in infile:
            line = line.rstrip()
            # hashlib works on bytes, so encode the decoded text explicitly.
            digest = hashlib.sha512(line.encode('utf-8')).digest()
            if digest not in seen:
                seen.add(digest)
                write_file.write(line)
                write_file.write('\n')
    return out_path