From b75ea56294579e54b43c8cab3aadee15e94ea1e7 Mon Sep 17 00:00:00 2001 From: Gavin Lee Date: Fri, 3 Nov 2017 04:00:39 +0800 Subject: [PATCH 1/5] Edit fp_growth.py to python-3 version Without change test file, it's still Python 2 version --- fp_growth.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fp_growth.py b/fp_growth.py index 4ada47d..3d750ff 100644 --- a/fp_growth.py +++ b/fp_growth.py @@ -10,7 +10,6 @@ """ from collections import defaultdict, namedtuple -from itertools import imap __author__ = 'Eric Naeseth ' __copyright__ = 'Copyright © 2009 Eric Naeseth' @@ -40,19 +39,20 @@ def find_frequent_itemsets(transactions, minimum_support, include_support=False) items[item] += 1 # Remove infrequent items from the item support dictionary. - items = dict((item, support) for item, support in items.iteritems() - if support >= minimum_support) + tmpList = [(item, support) for item, support in items.items() + if support >= minimum_support] + items = dict(tmpList) # Build our FP-tree. Before any transactions can be added to the tree, they # must be stripped of infrequent items and their surviving items must be # sorted in decreasing order of frequency. def clean_transaction(transaction): transaction = filter(lambda v: v in items, transaction) - transaction.sort(key=lambda v: items[v], reverse=True) + transaction = sorted(transaction, key=lambda v: items[v], reverse=True) return transaction master = FPTree() - for transaction in imap(clean_transaction, transactions): + for transaction in list(map(clean_transaction, transactions)): master.add(transaction) def find_with_suffix(tree, suffix): @@ -136,7 +136,7 @@ def items(self): element of the tuple is the item itself, and the second element is a generator that will yield the nodes in the tree that belong to the item. 
""" - for item in self._routes.iterkeys(): + for item in self._routes.keys(): yield (item, self.nodes(item)) def nodes(self, item): @@ -167,15 +167,15 @@ def collect_path(node): return (collect_path(node) for node in self.nodes(item)) def inspect(self): - print 'Tree:' + print('Tree:') self.root.inspect(1) - print - print 'Routes:' + print() + print('Routes:') for item, nodes in self.items(): - print ' %r' % item + print(' %r' % item) for node in nodes: - print ' %r' % node + print(' %r' % node) def conditional_tree_from_paths(paths): """Build a conditional FP-tree from the given prefix paths.""" @@ -312,7 +312,7 @@ def children(self): return tuple(self._children.itervalues()) def inspect(self, depth=0): - print (' ' * depth) + repr(self) + print ((' ' * depth) + repr(self)) for child in self.children: child.inspect(depth + 1) @@ -355,4 +355,4 @@ def __repr__(self): result = sorted(result, key=lambda i: i[0]) for itemset, support in result: - print str(itemset) + ' ' + str(support) + print(str(itemset) + ' ' + str(support)) From 992d2e8cabf8e22384395bed8d594224eef3eeb8 Mon Sep 17 00:00:00 2001 From: Gavin Lee Date: Fri, 3 Nov 2017 04:07:25 +0800 Subject: [PATCH 2/5] Change symtax Also Edit Readme to python-3 --- Readme.md | 4 ++-- fp_growth.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Readme.md b/Readme.md index b15cdaa..17fb5a2 100644 --- a/Readme.md +++ b/Readme.md @@ -37,11 +37,11 @@ Once installed, the module can also be used as a stand-alone script. It will read a list of transactions formatted as a CSV file. (An example of such a file in included in the `examples` directory.) 
- python -m fp_growth -s {minimum support} {path to CSV file} + python3 -m fp_growth -s {minimum support} {path to CSV file} For example, to find the itemsets with support ≥ 4 in the included example file: - python -m fp_growth -s 4 examples/tsk.csv + python3 -m fp_growth -s 4 examples/tsk.csv References ---------- diff --git a/fp_growth.py b/fp_growth.py index 3d750ff..0da58fe 100644 --- a/fp_growth.py +++ b/fp_growth.py @@ -39,9 +39,8 @@ def find_frequent_itemsets(transactions, minimum_support, include_support=False) items[item] += 1 # Remove infrequent items from the item support dictionary. - tmpList = [(item, support) for item, support in items.items() - if support >= minimum_support] - items = dict(tmpList) + items = dict((item, support) for item, support in items.items() + if support >= minimum_support) # Build our FP-tree. Before any transactions can be added to the tree, they # must be stripped of infrequent items and their surviving items must be From 06ccf3607e8b59d643c7ee1e6c7b2bf8fd7df43d Mon Sep 17 00:00:00 2001 From: Gavin Lee Date: Fri, 3 Nov 2017 23:24:54 +0800 Subject: [PATCH 3/5] Add association rule to the script --- Readme.md | 19 ++++++++++++++-- fp_growth.py | 61 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 73 insertions(+), 7 deletions(-) diff --git a/Readme.md b/Readme.md index 17fb5a2..daa7315 100644 --- a/Readme.md +++ b/Readme.md @@ -25,7 +25,7 @@ in your transactions with the following code: from fp_growth import find_frequent_itemsets for itemset in find_frequent_itemsets(transactions, minsup): print itemset - + Note that `find_frequent_itemsets` returns a generator of itemsets, not a greedily-populated list. Each item must be hashable (i.e., it must be valid as a member of a dictionary or a set). @@ -38,11 +38,26 @@ read a list of transactions formatted as a CSV file. (An example of such a file in included in the `examples` directory.) 
python3 -m fp_growth -s {minimum support} {path to CSV file} - + For example, to find the itemsets with support ≥ 4 in the included example file: python3 -m fp_growth -s 4 examples/tsk.csv +Also, support can be a support rate, for example: + + python3 -m fp_growth -s 0.3 examples/tsk.csv + +You can find association rules as well, as sample + + python3 -m fp_growth -f rule -c 0.4 -s 4 examples/tsk.csv + +We used -f to decide what you want to find, association rule or frequency itemset + +All parameter: + - `-s`: minimum support (count or rate is fine) + - `-c`: minimum confidence + - `-f`: problem solving (freq or rule) + References ---------- diff --git a/fp_growth.py b/fp_growth.py index 0da58fe..2ca4d77 100644 --- a/fp_growth.py +++ b/fp_growth.py @@ -32,6 +32,10 @@ def find_frequent_itemsets(transactions, minimum_support, include_support=False) """ items = defaultdict(lambda: 0) # mapping from items to their supports + # if using support rate instead of support count + if 0 < minimum_support <= 1: + minimum_support = minimum_support * len(transactions) + # Load the passed-in transactions and count the support that individual # items have. 
for transaction in transactions: @@ -321,6 +325,37 @@ def __repr__(self): return "<%s %r (%r)>" % (type(self).__name__, self.item, self.count) +def subs(l): + """ + Used for assRule + """ + assert type(l) is list + if len(l) == 1: + return [l] + x = subs(l[1:]) + return x + [[l[0]] + y for y in x] + + +# Association rules +def assRule(freq, min_conf = 0.6): + """ + This assRule must input a dict for itemset -> support rate + And also can customize your minimum confidence + """ + assert type(freq) is dict + result = [] + for item, sup in freq.items(): + for subitem in subs(list(item)): + sb = [x for x in item if x not in subitem] + if sb == [] or subitem == []: continue + if len(subitem) == 1 and (subitem[0][0] == 'in' or subitem[0][0] == 'out'): + continue + conf = sup/freq[tuple(subitem)] + if conf >= min_conf: + result.append({'from':subitem, 'to':sb, 'sup':sup, 'conf':conf}) + return result + + if __name__ == '__main__': from optparse import OptionParser import csv @@ -330,10 +365,18 @@ def __repr__(self): help='Minimum itemset support (default: 2)') p.add_option('-n', '--numeric', dest='numeric', action='store_true', help='Convert the values in datasets to numerals (default: false)') + p.add_option('-c', '--minimum-confidence', dest='minconf', type='float', + help='Minimum rule confidence (default 0.6)') + p.add_option('-f', '--find', dest='find', type='str', + help='Finding freq(frequency itemsets) or rule(association rules) (default: freq)') p.set_defaults(minsup=2) p.set_defaults(numeric=False) - + p.set_defaults(minconf=0.6) + p.set_defaults(find='freq') options, args = p.parse_args() + + assert options.find == 'freq' or options.find == 'rule' + if len(args) < 1: p.error('must provide the path to a CSV file to read') @@ -349,9 +392,17 @@ def __repr__(self): transactions.append(row) result = [] + res_for_rul = {} for itemset, support in find_frequent_itemsets(transactions, options.minsup, True): result.append((itemset,support)) - - result = sorted(result, 
key=lambda i: i[0]) - for itemset, support in result: - print(str(itemset) + ' ' + str(support)) + res_for_rul[tuple(itemset)] = support + + if options.find == 'freq': + result = sorted(result, key=lambda i: i[0]) + for itemset, support in result: + print(str(itemset) + ' ' + str(support)) + if options.find == 'rule': + rules = assRule(res_for_rul, options.minconf) + for ru in rules: + print(str(ru['from']) + ' -> ' + str(ru['to'])) + print('support = ' + str(ru['sup']) + 'confindence = ' + str(ru['conf'])) From 75521274d7ca8301cc2077501dd9f4ddd643f403 Mon Sep 17 00:00:00 2001 From: Gavin Lee Date: Fri, 3 Nov 2017 23:26:48 +0800 Subject: [PATCH 4/5] Fix Readme bug --- Readme.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Readme.md b/Readme.md index daa7315..505bc72 100644 --- a/Readme.md +++ b/Readme.md @@ -54,9 +54,9 @@ You can find association rules as well, as sample We used -f to decide what you want to find, association rule or frequency itemset All parameter: - - `-s`: minimum support (count or rate is fine) - - `-c`: minimum confidence - - `-f`: problem solving (freq or rule) + * `-s`: minimum support (count or rate is fine) + * `-c`: minimum confidence + * `-f`: problem solving (freq or rule) References ---------- From 550dc22b0325d6ba3394c87d298e3f5623121206 Mon Sep 17 00:00:00 2001 From: Gavin Lee Date: Fri, 3 Nov 2017 23:27:42 +0800 Subject: [PATCH 5/5] Fix Readme bug --- Readme.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Readme.md b/Readme.md index 505bc72..8422f26 100644 --- a/Readme.md +++ b/Readme.md @@ -54,9 +54,9 @@ You can find association rules as well, as sample We used -f to decide what you want to find, association rule or frequency itemset All parameter: - * `-s`: minimum support (count or rate is fine) - * `-c`: minimum confidence - * `-f`: problem solving (freq or rule) +* `-s`: minimum support (count or rate is fine) +* `-c`: minimum confidence +* `-f`: problem solving (freq or 
rule) References ----------