Apriori算法在购物篮分析中的运用
购物篮分析是一个很经典的数据挖掘案例,运用到了Apriori算法。下面使用从网上下载的某超市某月份的数据库,利用Apriori算法进行关联分析。本例使用Python+MongoDB。
处理过程:1. 数据建模(将Excel中的数据写入到MongoDB数据库);2. 从数据库中读取数据进行分析。
Excel文件http://download.csdn.net/detail/artscrafts/6805689
案例配置文件 setting.py
"""Shared configuration for the shopping-basket Apriori example (setting.py)."""

# Excel workbook that holds the raw supermarket data.
data_source = 'supermarket.xls'

# MongoDB connection parameters.
host = 'localhost'
port = 27017

# Database name, and the collections for item names and transaction records.
db_name = 'shopping_basket'
items_name = 'goods_items'
record_name = 'transaction_record'
读取Excel数据到MongoDB中 load_basket.py
"""Load the supermarket workbook into MongoDB (load_basket.py).

Row index 1 of the first sheet is read as the item names and every row from
index 2 onwards as one transaction; column 0 is skipped in both cases. The
results are written to the collections named in ``setting``.
"""
from xlrd import open_workbook
from pymongo import MongoClient
import setting

wb = open_workbook(setting.data_source, encoding_override='utf-8')
client = MongoClient(setting.host, setting.port)
db = client[setting.db_name]
items = []  # item names in column order, filled by load_items()

_BATCH_SIZE = 100  # number of documents sent to MongoDB per insert call


def read_one_line(workbook, sheet_index=0, row_index=0, start_col_index=0):
    """Yield the cell values of one row, from ``start_col_index`` to the end.

    Args:
        workbook: an opened xlrd workbook.
        sheet_index: index of the sheet to read.
        row_index: index of the row to read.
        start_col_index: first column to yield; a value outside
            ``0..ncols`` yields nothing.

    Raises:
        IndexError: if ``row_index`` is outside the sheet. Because this is a
            generator, the error surfaces on the first iteration.
    """
    sheet = workbook.sheets()[sheet_index]
    if row_index < 0 or row_index >= sheet.nrows:
        raise IndexError('row index %d is out of range' % row_index)
    max_col = sheet.ncols
    # 0 is a valid first column; only truly out-of-range starts yield nothing.
    if not 0 <= start_col_index <= max_col:
        start_col_index = max_col
    for col_index in range(start_col_index, max_col):
        yield sheet.cell(row_index, col_index).value


def readlines(workbook, sheet_index=0, start_row_index=0, end_row_index=None,
              start_col_index=0, end_col_index=None):
    """Yield each row in the requested rectangle as a list of cell values.

    ``end_row_index`` / ``end_col_index`` are exclusive; ``None`` (or 0)
    means "through the last row / column of the sheet".
    """
    sheet = workbook.sheets()[sheet_index]
    end_row_index = end_row_index or sheet.nrows
    end_col_index = end_col_index or sheet.ncols
    for row_index in range(start_row_index, end_row_index):
        yield [sheet.cell(row_index, col_index).value
               for col_index in range(start_col_index, end_col_index)]


def _insert_in_batches(collection, documents, batch_size=_BATCH_SIZE):
    """Insert ``documents`` in chunks, including the final partial chunk."""
    # NOTE: Collection.insert is the PyMongo 2.x-era bulk insert this script
    # was written against; PyMongo 3+ provides insert_many for the same job.
    batch = []
    for document in documents:
        batch.append(document)
        if len(batch) >= batch_size:
            collection.insert(batch)
            batch = []
    if batch:
        # Without this flush the last (< batch_size) documents would be lost.
        collection.insert(batch)


def load_items():
    """Store the item names as ``{'id', 'name'}`` documents and cache them.

    The names are also appended to the module-level ``items`` list, which
    ``load_record`` uses to translate column positions into item names.
    """
    collection = db[setting.items_name]
    names = list(read_one_line(wb, row_index=1, start_col_index=1))
    items.extend(names)
    documents = ({'id': item_id, 'name': name}
                 for item_id, name in enumerate(names, 1))
    _insert_in_batches(collection, documents)


def load_record():
    """Store every transaction row as ``{'id', 'items'}``.

    ``items`` holds the names of the columns whose cell value is ``'T'``.
    ``load_items`` must have run first so that ``items`` is populated.
    """
    collection = db[setting.record_name]
    rows = readlines(wb, start_row_index=2, start_col_index=1)
    documents = (
        {'id': record_id,
         # zip() pairs each cell with its column's item name and cannot run
         # past the end of ``items`` if a row is wider than the header.
         'items': [name for name, flag in zip(items, row) if flag == 'T']}
        for record_id, row in enumerate(rows, 1)
    )
    _insert_in_batches(collection, documents)


def main():
    """Load items, then transaction records, and close the connection."""
    print('........start loading........')
    try:
        load_items()
        load_record()
    finally:
        # Release the connection even if one of the loads fails.
        client.close()
    print('.........end loading.........')


if __name__ == '__main__':
    main()
进行数据分析 analysis_basket.py
"""Apriori frequent-itemset mining over the stored transactions.

(analysis_basket.py) Reads the transaction records written by the loader
script and computes frequent itemsets level by level.
"""
from pymongo import MongoClient
import setting

client = MongoClient(setting.host, setting.port)
db = client[setting.db_name]
data = []  # one list of item names per transaction, filled by filldata()


def filldata():
    """Append the ``items`` list of every stored record to ``data``."""
    collection = db[setting.record_name]
    for row in collection.find():
        data.append(row['items'])


def connect(items, rows=None):
    """Join frequent k-itemsets into candidate (k+1)-itemsets.

    Two itemsets are joined when they agree on everything but their last
    element. The keys are sorted first so that itemsets sharing a prefix are
    adjacent; the early ``break`` below is only valid under that ordering.

    Args:
        items: dict mapping itemset tuples (all of the same length) to support.
        rows: transactions used to count support; defaults to the
            module-level ``data``.

    Returns:
        dict mapping each candidate tuple to its support count.
    """
    result = {}
    keys = sorted(items)
    for i, left in enumerate(keys):
        prefix = left[:-1]
        for right in keys[i + 1:]:
            if right[:-1] != prefix:
                break  # sorted order: no later key can share this prefix
            key = left + (right[-1],)
            result[key] = getsupp(key, rows)
    return result


def pruning(items, minsupp):
    """Return only the itemsets whose support is at least ``minsupp``."""
    return {key: supp for key, supp in items.items() if supp >= minsupp}


def contain(par, sub):
    """Return True if every element of ``sub`` is in ``par``."""
    return all(v in par for v in sub)


def getsupp(item, rows=None):
    """Count the transactions that contain every element of ``item``.

    Args:
        item: the itemset (an iterable of item names).
        rows: transactions to scan; defaults to the module-level ``data``.
    """
    if rows is None:
        rows = data
    return sum(1 for row in rows if contain(row, item))


def apriori(data, minsupp, k):
    """Compute frequent itemsets of size 1 up to (but excluding) ``k``.

    Args:
        data: iterable of transactions, each an iterable of hashable items.
        minsupp: minimum number of transactions an itemset must appear in.
        k: exclusive upper bound on the itemset size.

    Returns:
        dict keyed ``'k=1'`` for single items and ``'K=<n>'`` for larger
        sizes (the mixed case is kept for compatibility with existing
        callers), each mapping itemset tuples to their support. Levels
        without any frequent itemset are omitted.
    """
    candidate_set = {}
    for row in data:
        # set() counts an item once per transaction, matching getsupp().
        for name in set(row):
            key = (name,)
            candidate_set[key] = candidate_set.get(key, 0) + 1
    frequently_set = pruning(candidate_set, minsupp)
    result = {'k=1': frequently_set}
    for n in range(2, k):
        # Count support against the dataset passed in, not the global one.
        frequently_set = pruning(connect(frequently_set, data), minsupp)
        if not frequently_set:
            break
        result['K=' + str(n)] = frequently_set
        if len(frequently_set) == 1:
            break  # a single itemset cannot be joined into a larger one
    return result


def main():
    """Load the transactions, run Apriori, then print and return the result."""
    try:
        filldata()
    finally:
        client.close()
    res = apriori(data, 30, 8)
    # Order the levels by itemset size rather than by key spelling.
    for level in sorted(res, key=lambda name: int(name.split('=')[1])):
        print('%s: %d frequent itemsets' % (level, len(res[level])))
        for itemset, supp in sorted(res[level].items()):
            print('  %r: %d' % (itemset, supp))
    return res


if __name__ == '__main__':
    main()
posted on 2014-01-03 21:05 Arts&Crafts 阅读(1938) 评论(0) 编辑 收藏 举报