Apriori算法在购物篮分析中的运用

  购物篮分析是一个很经典的数据挖掘案例,其中运用到了Apriori算法。下面使用从网上下载的某超市某月份的交易数据,利用Apriori算法进行关联分析。示例使用Python+MongoDB实现。

  处理过程:1. 数据建模(将Excel中的数据写入到MongoDB数据库);2. 从数据库中读取数据进行分析。

  Excel文件http://download.csdn.net/detail/artscrafts/6805689

  案例配置文件 setting.py

# Shared settings imported by the loader and the analysis scripts.

# Excel workbook that holds the raw supermarket data.
data_source = "supermarket.xls"

# MongoDB server address.
host = "localhost"
port = 27017

# Database name and the two collections used inside it.
db_name = "shopping_basket"
items_name = "goods_items"
record_name = "transaction_record"

  读取Excel数据到MongoDB中 load_basket.py

 1 from xlrd import open_workbook
 2 from pymongo import MongoClient
 3 import setting
 4 
 5 wb = open_workbook(setting.data_source, encoding_override='utf-8')
 6 client = MongoClient(setting.host, setting.port)
 7 db = client[setting.db_name]
 8 items = []
 9 
def read_one_line(workbook, sheet_index=0, row_index=0, start_col_index=0):
    """Yield the cell values of a single worksheet row, left to right.

    Args:
        workbook: Object whose ``sheets()`` returns a sequence of sheets; each
            sheet must expose ``nrows``, ``ncols`` and ``cell(row, col).value``.
        sheet_index: Position of the sheet to read.
        row_index: Zero-based row to read.
        start_col_index: Zero-based first column to read. A value outside
            ``0..ncols`` is treated as ``ncols``, so nothing is yielded.

    Yields:
        The ``value`` of each cell from ``start_col_index`` to the last column.

    Raises:
        IndexError: If ``row_index`` is outside the sheet. This is a
            generator, so the error surfaces on first iteration.
    """
    # Honour the requested sheet rather than always reading the first one.
    sheet = workbook.sheets()[sheet_index]
    max_row = sheet.nrows
    max_col = sheet.ncols
    # Column 0 is a valid start; only out-of-range values collapse to "empty".
    if not 0 <= start_col_index <= max_col:
        start_col_index = max_col
    if row_index < 0 or row_index >= max_row:
        raise IndexError('row_index %r is outside 0..%d' % (row_index, max_row - 1))
    # range() works on both Python 2 and Python 3.
    for col_index in range(start_col_index, max_col):
        yield sheet.cell(row_index, col_index).value
20 
def readlines(workbook, sheet_index=0, start_row_index=0, end_row_index=None, start_col_index=0, end_col_index=None):
    """Yield rows of a worksheet as lists of cell values.

    Args:
        workbook: Object whose ``sheets()`` returns a sequence of sheets; each
            sheet must expose ``nrows``, ``ncols`` and ``cell(row, col).value``.
        sheet_index: Position of the sheet to read.
        start_row_index: Zero-based first row (inclusive).
        end_row_index: Row to stop before (exclusive); ``None`` means the
            last row of the sheet.
        start_col_index: Zero-based first column (inclusive).
        end_col_index: Column to stop before (exclusive); ``None`` means the
            last column of the sheet.

    Yields:
        One list of cell values per row in the requested window.
    """
    sheet = workbook.sheets()[sheet_index]
    # Compare against None so an explicit 0 bound gives an empty window
    # instead of silently meaning "to the end".
    if end_row_index is None:
        end_row_index = sheet.nrows
    if end_col_index is None:
        end_col_index = sheet.ncols
    # range() works on both Python 2 and Python 3.
    for row_index in range(start_row_index, end_row_index):
        yield [
            sheet.cell(row_index, col_index).value
            for col_index in range(start_col_index, end_col_index)
        ]
30 
def load_items():
    """Copy the item names from the workbook into the items collection.

    Reads row index 1 of the workbook's first sheet starting at column
    index 1 (presumably the header row holding the goods names; verify
    against the spreadsheet). Each value is stored as ``{'id': n, 'name':
    value}`` with ``n`` counting from 1, and is also appended to the
    module-level ``items`` list.
    """
    collection = db[setting.items_name]
    batch_size = 100
    batch = []
    names = read_one_line(wb, row_index=1, start_col_index=1)
    for item_id, name in enumerate(names, 1):
        batch.append({'id': item_id, 'name': name})
        items.append(name)
        if len(batch) >= batch_size:
            # NOTE: Collection.insert is the legacy PyMongo 2.x bulk call;
            # on PyMongo >= 3 the equivalent is insert_many.
            collection.insert(batch)
            batch = []
    # Flush the final partial batch; without this the trailing documents
    # (or all of them when there are fewer than 100) are never written.
    if batch:
        collection.insert(batch)
44 
def load_record():
    """Copy the transaction rows from the workbook into the record collection.

    Reads every row from row index 2 onward, starting at column index 1.
    Each row is stored as ``{'id': n, 'items': [...]}``, where ``n`` counts
    from 1 and ``items`` holds the names from the module-level ``items``
    list whose cell in that row equals ``'T'``. ``load_items()`` must have
    run first so that ``items`` is populated.
    """
    collection = db[setting.record_name]
    batch_size = 100
    batch = []
    lines = readlines(wb, start_row_index=2, start_col_index=1)
    for record_id, line in enumerate(lines, 1):
        bought = [items[i] for i, flag in enumerate(line) if flag == 'T']
        batch.append({'id': record_id, 'items': bought})
        if len(batch) >= batch_size:
            # NOTE: Collection.insert is the legacy PyMongo 2.x bulk call;
            # on PyMongo >= 3 the equivalent is insert_many.
            collection.insert(batch)
            batch = []
    # Flush the final partial batch; without this the trailing records
    # are never written.
    if batch:
        collection.insert(batch)
57 
58 
def main():
    """Load the items, then the transaction records, into MongoDB."""
    # Parenthesised single-argument print is valid on Python 2 and Python 3.
    print('........start loading........')
    try:
        # Order matters: load_record() reads the list load_items() fills.
        load_items()
        load_record()
    finally:
        # Release the connection even when loading fails part-way.
        client.close()
    print('.........end loading.........')


if __name__ == '__main__':
    main()

  进行数据分析 analysis_basket.py

 1 #Apriori
 2 from pymongo import MongoClient
 3 import setting
 4 
 5 client = MongoClient(setting.host, setting.port)
 6 db = client[setting.db_name]
 7 data = []
 8 
 9 #from mongodb to items
10 def filldata():
11     collection = db[setting.record_name]
12     cur = collection.find()
13     for row in cur:
14         data.append(row['items'])
15 
def connect(items, supp_func=None):
    """Build the next-size candidate itemsets from the frequent itemsets.

    Two itemsets are joined when they agree on everything except their last
    element; the candidate is the first itemset followed by the last element
    of the second.

    Args:
        items: Dict keyed by itemset tuples, all of the same length. Only
            the keys are used.
        supp_func: Callable taking a candidate tuple and returning its
            support. Defaults to the module-level ``getsupp``.

    Returns:
        Dict mapping each candidate tuple to the value of ``supp_func``.
    """
    if supp_func is None:
        supp_func = getsupp
    # Sort so that itemsets sharing a prefix are adjacent. The early break
    # below is only correct on sorted keys, and dict order is not sorted.
    # sorted() also returns a list, which is indexable on Python 3.
    keys = sorted(items)
    length = len(keys)
    result = {}
    for i in range(length):
        first = keys[i]
        prefix = first[:-1]
        for j in range(i + 1, length):
            second = keys[j]
            if second[:-1] != prefix:
                # Keys are sorted, so no later key can share this prefix.
                break
            key = first + (second[-1],)
            result[key] = supp_func(key)
    return result
30 
31 
def pruning(items, minsupp):
    """Return a new dict with only the entries whose value is at least ``minsupp``."""
    return {
        itemset: support
        for itemset, support in items.items()
        if support >= minsupp
    }
38 
def contain(par, sub):
    """Return True when every element of ``sub`` is also in ``par``."""
    return all(element in par for element in sub)
44 
45 
def getsupp(item):
    """Count the rows of module-level ``data`` that contain every element of ``item``."""
    return sum(1 for transaction in data if contain(transaction, item))
52 
def apriori(data, minsupp, k):
    """Mine frequent itemsets level by level.

    Args:
        data: Iterable of transactions, each an iterable of hashable items.
            It is used to count single items. Candidates of size 2 and up
            are counted by ``connect``, which by default counts against the
            module-level ``data`` list.
        minsupp: Minimum absolute support (number of transactions).
        k: Exclusive upper bound on itemset size; levels 1 to ``k - 1`` are
            explored.

    Returns:
        Dict mapping ``'k=<size>'`` to a dict of ``{itemset tuple: support}``.
        ``'k=1'`` is always present. Larger sizes appear only while frequent
        itemsets of that size exist.
    """
    # Level 1: count every single item directly from the transactions.
    candidate_set = {}
    for row in data:
        for i in row:
            key = (i,)
            candidate_set[key] = candidate_set.get(key, 0) + 1
    frequently_set = pruning(candidate_set, minsupp)

    result = {'k=1': frequently_set}
    for n in range(2, k):
        candidate_set = connect(frequently_set)
        frequently_set = pruning(candidate_set, minsupp)
        # Stop only when nothing is frequent at this size. A single frequent
        # itemset is still a valid result and must be kept.
        if not frequently_set:
            break
        # Same lowercase 'k=' prefix as level 1, so keys are consistent.
        result['k=' + str(n)] = frequently_set
    return result
69 
def main():
    """Load the transactions, mine frequent itemsets and print them.

    Returns:
        The dict produced by ``apriori`` (minimum support 30, size bound 8).
    """
    try:
        filldata()
    finally:
        # The data is now in memory, so the connection can be released. It
        # is also released when the read fails.
        client.close()
    res = apriori(data, 30, 8)
    # Report the result instead of discarding it. Single-argument print with
    # parentheses is valid on Python 2 and Python 3.
    for level in sorted(res):
        print('%s: %s' % (level, res[level]))
    return res


if __name__ == '__main__':
    main()

 

  

  

posted on 2014-01-03 21:05  Arts&Crafts  阅读(1938)  评论(0)  编辑  收藏  举报

导航