csvToolbox/reduceCSVToFindPattern.py at master · jeremiahmarks/csvToolbox · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: jeremiah.marks
# @Date:   2015-06-15 17:46:35
# @Last Modified 2015-06-29
# @Last Modified time: 2015-06-29 20:21:46

# This script opens a CSV file and goes through it row by row,
# cell by cell, recording whether each cell holds any data.
# An empty cell is written out as 0;
# a cell with content is written out as 1.
# The result can then be opened in Excel with conditional formatting
# to get a color-coded, visual representation of where data is in the file.


import csv

# Path of the CSV that is read by the functions in this module that do not
# define their own local infilename.
infilename='products.csv'
# Path of the CSV that those same functions open for writing.
outfilename='productsBin.csv'
def leavefieldnames():
    """Write a 0/1 occupancy map of the input CSV, keeping its header names.

    Reads the module-level ``infilename`` and writes ``outfilename``.  The
    output has the input's header row plus one extra column named
    "valuesOutsideOfTable".  Each output cell is "1" when the matching
    input cell has content and "0" when it is empty or missing; the extra
    column is "1" only for rows that carry more values than the header has
    columns.
    """
    overflowkey="valuesOutsideOfTable"
    with open(infilename) as infile:
        reader=csv.DictReader(infile, restkey=overflowkey)
        # fieldnames is None when the input file is completely empty.
        allfields=list(reader.fieldnames or []) + [overflowkey]
        with open(outfilename, 'wb') as outfile:
            writer=csv.DictWriter(outfile, allfields, restval='0')
            writer.writeheader()
            for eachrow in reader:
                # DictReader fills the missing cells of a short row with
                # None, and only adds the overflow key to rows that are too
                # long, so test truthiness rather than len() of the value.
                flags={}
                for eachcol in allfields:
                    flags[eachcol]="1" if eachrow.get(eachcol) else "0"
                writer.writerow(flags)
def replacefieldnames():
    """Write a 0/1 occupancy map of the input CSV with numbered columns.

    Reads the module-level ``infilename`` and writes ``outfilename``.  The
    output header is the zero-based position of each input column, followed
    by one extra position for values that fall outside the table (rows with
    more values than the header has columns).  Each output cell is "1" when
    the matching input cell has content and "0" when it is empty or missing.
    """
    overflowkey="valuesOutsideOfTable"
    with open(infilename) as infile:
        reader=csv.DictReader(infile, restkey=overflowkey)
        # fieldnames is None when the input file is completely empty.
        allfields=list(reader.fieldnames or []) + [overflowkey]
        positions=list(range(len(allfields)))
        with open(outfilename, 'wb') as outfile:
            writer=csv.DictWriter(outfile, positions, restval='0')
            writer.writeheader()
            for eachrow in reader:
                # Walk the header in file order.  The row itself is a plain
                # dict whose iteration order is not the column order, so it
                # cannot be enumerated to obtain column positions.
                flags={}
                for fieldloc, fieldname in enumerate(allfields):
                    # .get() also covers short rows (None values) and rows
                    # without the overflow key.
                    flags[fieldloc]="1" if eachrow.get(fieldname) else "0"
                writer.writerow(flags)

def checkrowtypes():
    """Interactively split ./droppings/products.csv by fill pattern.

    For every row a pattern string is built with one character per header
    column plus one for values outside the table: "1" where the cell has
    content, "0" where it is empty or missing.  The first time a pattern is
    seen, the row is printed and the user is asked to name that kind of row.

    Files written under ./droppings/:
      * productsBin.csv      - the 0/1 map, with numbered columns
      * <pattern>.csv        - the original rows that have that pattern
      * <name>.csv           - the original rows of every pattern given
                               that name
      * ataglance.csv.txt    - "line number,name,pattern" for each row

    Returns a list with the result of close() for every extra file that
    was opened (one entry per file).
    """
    import csv
    infilename='./droppings/products.csv'
    outfilename='./droppings/productsBin.csv'
    overflowkey="valuesOutsideOfTable"
    # Every file opened outside a with-statement is tracked here so that it
    # is closed even when reading, writing or the prompt raises.
    openfiles=[]
    try:
        with open(infilename) as infile:
            reader=csv.DictReader(infile, restkey=overflowkey)
            # fieldnames is None when the input file is completely empty.
            allfields=list(reader.fieldnames or []) + [overflowkey]
            fieldsasnums=list(range(len(allfields)))
            with open(outfilename, 'wb') as outfile:
                # Writers are kept in separate dicts so that a name typed
                # by the user can never replace the writer of a pattern
                # (or the other way round).
                patternwriters={}
                namewriters={}
                patternnames={} # pattern string -> name chosen by the user
                writer=csv.DictWriter(outfile, fieldsasnums, restval='0')
                writer.writeheader()
                ataglancefile=open('./droppings/ataglance.csv.txt', 'w+')
                openfiles.append(ataglancefile)
                for eachrow in reader:
                    linenum=reader.line_num
                    tempthing={}
                    # Walk the header in file order; the row dict's own
                    # iteration order is not the column order.  .get() also
                    # covers short rows (None) and a missing overflow key.
                    for fieldloc, fieldname in enumerate(allfields):
                        tempthing[fieldloc]="1" if eachrow.get(fieldname) else "0"
                    thisPattern=''.join([tempthing[k] for k in fieldsasnums])
                    if thisPattern not in patternwriters:
                        patternfile=open("./droppings/" + thisPattern + ".csv", 'wb')
                        openfiles.append(patternfile)
                        thiswriter=csv.DictWriter(patternfile, allfields, restval='0')
                        thiswriter.writeheader()
                        patternwriters[thisPattern]=thiswriter
                        # Single-argument print(...) behaves the same as the
                        # print statement here.
                        print("\nName this type of row: \n")
                        print(eachrow)
                        patternname=raw_input("Please use something like product, option, or contact\n").strip(" \n")
                        if patternname not in namewriters:
                            namefile=open("./droppings/" + patternname + ".csv", 'wb')
                            openfiles.append(namefile)
                            thispatternwriter=csv.DictWriter(namefile, allfields, restval='0')
                            thispatternwriter.writeheader()
                            namewriters[patternname]=thispatternwriter
                        patternnames[thisPattern]=patternname
                    writer.writerow(tempthing)
                    namewriters[patternnames[thisPattern]].writerow(eachrow)
                    patternwriters[thisPattern].writerow(eachrow)
                    ataglancefile.write(str(linenum) + "," + patternnames[thisPattern] + "," + thisPattern + "\n")
    finally:
        closed=[k.close() for k in openfiles]
    return closed


def expandcells():
    """Pad every cell with spaces so each column has a uniform width.

    Reads ./droppings/products.csv twice: the first pass finds, for each
    column, the longest of the header name and all of its cell values; the
    second pass writes ./droppings/productsBin.csv with the header and
    every cell left-justified to that width.  Cells missing from short
    rows are treated as empty; values beyond the last header column are
    not written.
    """
    import csv
    infilename='./droppings/products.csv'
    outfilename='./droppings/productsBin.csv'
    fillchar=' '
    longestcell={}
    with open(infilename) as infile:
        reader=csv.DictReader(infile)
        # fieldnames is None when the input file is completely empty.
        fieldnames=list(reader.fieldnames or [])
        # Start from the header widths so a file without data rows works.
        for afieldname in fieldnames:
            longestcell[afieldname]=len(afieldname)
        for eachrow in reader:
            for afieldname in fieldnames:
                # Measure the cell value (None for short rows), not the
                # column name.
                celllength=len(eachrow[afieldname] or '')
                if celllength > longestcell[afieldname]:
                    longestcell[afieldname]=celllength
    with open(infilename) as infile:
        reader=csv.DictReader(infile)
        fieldnames=list(reader.fieldnames or [])
        with open(outfilename, 'wb') as outfile:
            # Rows are written as plain lists in header order, so the
            # padded header text never has to be mapped back to the
            # original column name.
            writer=csv.writer(outfile)
            writer.writerow([afieldname.ljust(longestcell[afieldname], fillchar)
                             for afieldname in fieldnames])
            for eachline in reader:
                writer.writerow([(eachline[afieldname] or '').ljust(longestcell[afieldname], fillchar)
                                 for afieldname in fieldnames])


def reductlengthandremovenewlines():
    """Write a fixed-width, one-line-per-row copy of the input CSV.

    Reads the module-level ``infilename`` and writes ``outfilename``.
    Every header name and every cell is cut to its first 30 characters and
    right-justified with spaces to 50 characters.  Newlines inside a cell
    are replaced by the two characters backslash-n before the cut.  Cells
    missing from short rows are treated as empty.
    """
    def shorten(text):
        # Keep the first 30 characters, then pad on the left to 50.
        return text[:30].rjust(50, ' ')

    with open(infilename) as infile:
        thisreader=csv.DictReader(infile)
        # fieldnames is None when the input file is completely empty.
        fieldnames=list(thisreader.fieldnames or [])
        with open(outfilename, 'wb') as outfile:
            # Rows are written as lists in header order.  Keying a dict on
            # the shortened headings would merge columns whose names share
            # their first 30 characters.
            thiswriter=csv.writer(outfile)
            thiswriter.writerow([shorten(eachfieldname) for eachfieldname in fieldnames])
            for eachline in thisreader:
                thiswriter.writerow([shorten((eachline[eachfieldname] or '').replace('\n', '\\n'))
                                     for eachfieldname in fieldnames])