-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreduceCSVToFindPattern.py
More file actions
155 lines (144 loc) · 6.82 KB
/
reduceCSVToFindPattern.py
File metadata and controls
155 lines (144 loc) · 6.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: jeremiah.marks
# @Date: 2015-06-15 17:46:35
# @Last Modified 2015-06-29
# @Last Modified time: 2015-06-29 20:21:46
# This script opens a csv file and goes through it
# cell-by-cell, row-by-row, checking whether each cell has any content.
# If the cell is empty, it writes a 0.
# If there is content in the cell, it writes a 1.
# This can then be opened in excel using conditional formatting
# to get a color coded, visual representation of where data is in the file.
import csv
# Input csv read by the functions in this file that do not set their own path.
infilename = 'products.csv'
# Output csv those same functions write their result to.
outfilename = 'productsBin.csv'
def leavefieldnames():
    """Write a 0/1 "has content" map of the input csv, keeping header names.

    Reads the module-level ``infilename`` and writes ``outfilename``.  The
    output has the input's columns plus a trailing "valuesOutsideOfTable"
    column.  Each cell is "1" when the matching input cell has content and
    "0" when it is empty or absent.  Cells beyond the header width are
    gathered by the reader under "valuesOutsideOfTable".
    """
    import sys

    overflowname = "valuesOutsideOfTable"
    with open(infilename) as infile:
        reader = csv.DictReader(infile, restkey=overflowname)
        # fieldnames is None when the input file is empty.
        columns = list(reader.fieldnames or []) + [overflowname]
        # The csv module wants a binary file on Python 2 but a text file
        # opened with newline='' on Python 3.
        if sys.version_info[0] < 3:
            outfile = open(outfilename, 'wb')
        else:
            outfile = open(outfilename, 'w', newline='')
        with outfile:
            # restval='0' marks a column the row does not have (normally
            # only the overflow column) as empty instead of leaving it blank.
            writer = csv.DictWriter(outfile, columns, restval='0')
            writer.writeheader()
            for eachrow in reader:
                # Truthiness instead of len(): DictReader pads short rows
                # with None, and len(None) raises TypeError.
                flags = dict(
                    (name, "1" if value else "0")
                    for name, value in eachrow.items()
                )
                writer.writerow(flags)
def replacefieldnames():
    """Write a 0/1 "has content" map of the input csv with numbered columns.

    Reads the module-level ``infilename`` and writes ``outfilename``.  The
    output header is 0, 1, 2, ... (one number per input column, plus one
    more for cells that fall outside the header width).  Each cell is "1"
    when the input cell in that position has content, otherwise "0".
    """
    import sys

    overflowname = "valuesOutsideOfTable"
    with open(infilename) as infile:
        reader = csv.DictReader(infile, restkey=overflowname)
        # fieldnames is None when the input file is empty.
        names = list(reader.fieldnames or []) + [overflowname]
        positions = list(range(len(names)))
        # The csv module wants a binary file on Python 2 but a text file
        # opened with newline='' on Python 3.
        if sys.version_info[0] < 3:
            outfile = open(outfilename, 'wb')
        else:
            outfile = open(outfilename, 'w', newline='')
        with outfile:
            writer = csv.DictWriter(outfile, positions, restval='0')
            writer.writeheader()
            for eachrow in reader:
                flags = {}
                # Walk the header order, not eachrow.keys(): dict key order
                # is arbitrary on Python 2, which would put flags under the
                # wrong column number.
                for position, name in enumerate(names):
                    # .get() + truthiness covers a missing overflow key and
                    # the None padding DictReader uses for short rows.
                    flags[position] = "1" if eachrow.get(name) else "0"
                writer.writerow(flags)
def checkrowtypes():
    """Split './droppings/products.csv' by which cells of each row are filled.

    Every row is reduced to a pattern string of "1"/"0" (one character per
    header column plus one for cells beyond the header).  The first time a
    pattern is seen the row is printed and the user is asked to name it.
    Files written, all under './droppings/':

    * productsBin.csv   - the 0/1 map with numbered columns
    * <pattern>.csv     - the original rows that have that pattern
    * <name>.csv        - the original rows whose pattern was given that name
    * ataglance.csv.txt - "line number,name,pattern" for every row

    Returns:
        A list holding the result of ``close()`` for the at-a-glance,
        per-pattern and per-name files (i.e. a list of ``None``).
    """
    import csv
    import sys

    infilename = './droppings/products.csv'
    outfilename = './droppings/productsBin.csv'
    overflowname = "valuesOutsideOfTable"

    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3

    def opencsvoutput(path):
        """Open *path* the way the csv writer of this interpreter needs."""
        # Binary file on Python 2, text file with newline='' on Python 3.
        if sys.version_info[0] < 3:
            return open(path, 'wb')
        return open(path, 'w', newline='')

    # Every handle opened by hand below; closed in the finally clause so
    # nothing leaks if reading, writing or the prompt raises.
    extrafiles = []
    try:
        with open(infilename) as infile:
            reader = csv.DictReader(infile, restkey=overflowname)
            # fieldnames is None when the input file is empty.
            allfields = list(reader.fieldnames or []) + [overflowname]
            fieldsasnums = list(range(len(allfields)))
            with opencsvoutput(outfilename) as outfile:
                writer = csv.DictWriter(outfile, fieldsasnums, restval='0')
                writer.writeheader()
                ataglancefile = open('./droppings/ataglance.csv.txt', 'w+')
                extrafiles.append(ataglancefile)
                # Pattern writers and name writers live in separate dicts so
                # a typed name that happens to equal a pattern string cannot
                # replace that pattern's writer.
                patternwriters = {}
                namewriters = {}
                patternnames = {}  # pattern string -> name typed by the user
                for eachrow in reader:
                    linenum = reader.line_num
                    # Walk the header order, not eachrow.keys(): dict order
                    # is arbitrary on Python 2 and would scramble the
                    # pattern.  .get() + truthiness covers a missing
                    # overflow key and None-padded short rows.
                    flags = dict(
                        (position, "1" if eachrow.get(name) else "0")
                        for position, name in enumerate(allfields)
                    )
                    thisPattern = ''.join(flags[k] for k in fieldsasnums)
                    if thisPattern not in patternwriters:
                        patternfile = opencsvoutput(
                            "./droppings/" + thisPattern + ".csv")
                        extrafiles.append(patternfile)
                        thiswriter = csv.DictWriter(
                            patternfile, allfields, restval='0')
                        thiswriter.writeheader()
                        patternwriters[thisPattern] = thiswriter
                        print("\nName this type of row: \n")
                        print(eachrow)
                        patternname = ask("Please use something like product, option, or contact\n").strip(" \n")
                        # NOTE(review): a name such as "products" or
                        # "productsBin" reopens (and truncates) the input or
                        # output file of this function; not guarded here.
                        if patternname not in namewriters:
                            namefile = opencsvoutput(
                                "./droppings/" + patternname + ".csv")
                            extrafiles.append(namefile)
                            thispatternwriter = csv.DictWriter(
                                namefile, allfields, restval='0')
                            thispatternwriter.writeheader()
                            namewriters[patternname] = thispatternwriter
                        patternnames[thisPattern] = patternname
                    writer.writerow(flags)
                    namewriters[patternnames[thisPattern]].writerow(eachrow)
                    patternwriters[thisPattern].writerow(eachrow)
                    ataglancefile.write(str(linenum) + "," + patternnames[thisPattern] + "," + thisPattern + "\n")
    finally:
        closed = [handle.close() for handle in extrafiles]
    return closed
def expandcells():
    """Pad every column of the csv so that all of its cells share one width.

    Reads './droppings/products.csv' twice: once to find, per column, the
    longest of the header name and every cell value, and once to write
    './droppings/productsBin.csv' with the header and each cell
    left-justified to that width using ``fillchar``.
    """
    import csv
    import sys

    infilename = './droppings/products.csv'
    outfilename = './droppings/productsBin.csv'
    fillchar = ' '

    # First pass: widest entry per column.
    with open(infilename) as infile:
        reader = csv.DictReader(infile)
        # fieldnames is None when the input file is empty.
        originalnames = list(reader.fieldnames or [])
        # Seed with the header widths so a file without data rows still has
        # an entry for every column.
        longestcell = dict((name, len(name)) for name in originalnames)
        for eachrow in reader:
            for name in originalnames:
                # Measure the cell value (not the header name); short rows
                # are padded with None by DictReader, treated as empty here.
                celllength = len(eachrow[name] or '')
                if celllength > longestcell[name]:
                    longestcell[name] = celllength

    # Padded header names are kept in their own list: changing
    # reader.fieldnames in place would change the keys of the rows read.
    paddednames = [
        name.ljust(longestcell[name], fillchar) for name in originalnames
    ]

    # Second pass: write the padded copy.
    with open(infilename) as infile:
        reader = csv.DictReader(infile)
        # The csv module wants a binary file on Python 2 but a text file
        # opened with newline='' on Python 3.
        if sys.version_info[0] < 3:
            outfile = open(outfilename, 'wb')
        else:
            outfile = open(outfilename, 'w', newline='')
        with outfile:
            writer = csv.DictWriter(outfile, paddednames)
            writer.writeheader()
            for eachline in reader:
                thisline = {}
                for name, paddedname in zip(originalnames, paddednames):
                    value = eachline[name] or ''
                    thisline[paddedname] = value.ljust(
                        longestcell[name], fillchar)
                writer.writerow(thisline)
def reductlengthandremovenewlines():
    """Copy the input csv with short, fixed-width, single-line cells.

    Reads the module-level ``infilename`` and writes ``outfilename``.  Each
    header name and each cell is clipped to its first 30 characters and
    right-justified in a 50-character field; before clipping, every newline
    inside a cell is replaced by the two characters backslash + "n".
    """
    import sys

    def shorten(text):
        """Clip *text* to 30 characters and right-justify it to width 50."""
        return text[:30].rjust(50, ' ')

    with open(infilename) as infile:
        thisreader = csv.DictReader(infile)
        # fieldnames is None when the input file is empty.
        fieldnames = list(thisreader.fieldnames or [])
        # Computed once instead of once per cell.
        # NOTE(review): header names sharing their first 30 characters end
        # up with the same heading, so those columns overwrite each other.
        theseheadings = [shorten(name) for name in fieldnames]
        # The csv module wants a binary file on Python 2 but a text file
        # opened with newline='' on Python 3.
        if sys.version_info[0] < 3:
            outfile = open(outfilename, 'wb')
        else:
            outfile = open(outfilename, 'w', newline='')
        with outfile:
            thiswriter = csv.DictWriter(outfile, theseheadings)
            thiswriter.writeheader()
            for eachline in thisreader:
                thisline = {}
                for name, heading in zip(fieldnames, theseheadings):
                    # Short rows are padded with None by DictReader; treat
                    # those cells as empty instead of failing on .replace.
                    cell = eachline[name] or ''
                    thisline[heading] = shorten(cell.replace('\n', '\\n'))
                thiswriter.writerow(thisline)