关联规则-R语言实现
关联规则
shangfr
2015年10月29日
本文旨在演示R语言arules包的关联规则用法,以及利用arulesViz对结果进行可视化。
关联规则是形如X→Y的蕴涵式,其中X和Y分别称为关联规则的先导(antecedent或left-hand-side, LHS)和后继(consequent或right-hand-side, RHS)。关联规则X→Y有两个基本度量:支持度(support)和置信度(confidence)。For more details see 关联规则.
R语言arules包提供了有效处理稀疏二元数据的数据结构,并提供函数执行Apriori和Eclat算法,用于挖掘频繁项集、最大频繁项集、闭频繁项集和关联规则,详见arules包文档。
蘑菇数据data下载
r语言代码
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
##
## The following objects are masked from 'package:base':
##
## %in%, abbreviate, write
# Load the mushroom data from an interactively chosen CSV file.
# The file has no header row, so columns get the default names V1, V2, ...
# stringsAsFactors = TRUE is spelled out because read.csv() stopped creating
# factors by default in R 4.0.0, while the "transactions" coercion below
# expects factor (or logical) columns.
data <- read.csv(file.choose(), header = FALSE, stringsAsFactors = TRUE)
trans <- as(data, "transactions") # convert the data frame to transactions
# inspect(trans) # uncomment to list every transaction

# Visual overview: item matrix of the first 50 transactions, then item
# frequency bar charts (items with support >= 0.5, and the 10 most frequent).
image(trans[1:50])
itemFrequencyPlot(trans, support = 0.5)
itemFrequencyPlot(trans, topN = 10, horiz = TRUE)

# Number of items in each transaction.
basketSize <- size(trans)
summary(basketSize)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23 23 23 23 23 23
# Per-item frequency as returned by itemFrequency() (relative by default --
# NOTE(review): confirm against the installed arules version).
itemFreq <- itemFrequency(trans)
# Normalise the frequencies to shares that sum to 1, then scale by the total
# number of items over all transactions to obtain per-item counts.
itemCount <- sum(basketSize) * (itemFreq / sum(itemFreq))
summary(itemCount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4 156 600 1570 2346 8124
# Items ranked by count, most frequent first.
orderedItem <- sort(itemCount, decreasing = TRUE)
# head() rather than [1:10]: no NA padding when fewer than 10 items exist.
head(orderedItem, 10)
## V17=p V18=w V7=f V19=o V8=c V9=b V13=s V14=s V5=f V11=t
## 8124 7924 7914 7488 6812 5612 5176 4936 4748 4608
# Mine association rules with minimum support 0.3 and minimum confidence 1
rules <- apriori(
  trans,
  parameter = list(support = 0.3, confidence = 1)
)
##
## Parameter specification:
## confidence minval smax arem aval originalSupport support minlen maxlen
## 1 0.1 1 none FALSE TRUE 0.3 1 10
## target ext
## rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09) (c) 1996-2004 Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 8124 transaction(s)] done [0.00s].
## sorting and recoding items ... [28 item(s)] done [0.00s].
## creating transaction tree ... done [0.02s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 done [0.00s].
## writing ... [4316 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Overview of the mined rule set: rule count, rule length distribution and
# summary statistics of the quality measures (support, confidence, lift).
summary(rules)
## set of 4316 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5 6 7 8 9
## 1 42 293 832 1244 1107 594 179 24
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 4.00 5.00 5.32 6.00 9.00
##
## summary of quality measures:
## support confidence lift
## Min. :0.3003 Min. :1 Min. :1.000
## 1st Qu.:0.3112 1st Qu.:1 1st Qu.:1.000
## Median :0.3299 Median :1 Median :1.025
## Mean :0.3540 Mean :1 Mean :1.141
## 3rd Qu.:0.3712 3rd Qu.:1 3rd Qu.:1.027
## Max. :1.0000 Max. :1 Max. :2.927
##
## mining info:
## data ntransactions support confidence
## trans 8124 0.3 1
# Show the first 10 rules. The index is capped at length(rules) so the call
# does not fail with an out-of-bounds subscript when fewer rules were mined.
inspect(rules[seq_len(min(10, length(rules)))])
## lhs rhs support confidence lift
## 1 {} => {V17=p} 1.0000000 1 1.000000
## 2 {V12=?} => {V17=p} 0.3052683 1 1.000000
## 3 {V9=n} => {V19=o} 0.3092073 1 1.084936
## 4 {V9=n} => {V7=f} 0.3092073 1 1.026535
## 5 {V9=n} => {V17=p} 0.3092073 1 1.000000
## 6 {V3=s} => {V17=p} 0.3146233 1 1.000000
## 7 {V20=e} => {V7=f} 0.3417036 1 1.026535
## 8 {V20=e} => {V17=p} 0.3417036 1 1.000000
## 9 {V23=d} => {V18=w} 0.3874938 1 1.025240
## 10 {V23=d} => {V17=p} 0.3874938 1 1.000000
# Keep only the rules whose consequent (rhs) contains the item "V1=e"
# (presumably the "edible" class label, going by the variable name).
edible <- subset(rules, rhs %in% "V1=e")
# Show at most 10 of them; capping the index avoids an out-of-bounds error
# when fewer than 10 rules match.
inspect(edible[seq_len(min(10, length(edible)))])
## lhs rhs support confidence lift
## 126 {V6=n,V11=t} => {V1=e} 0.3072378 1 1.930608
## 578 {V6=n,V9=b,V11=t} => {V1=e} 0.3072378 1 1.930608
## 581 {V6=n,V11=t,V19=o} => {V1=e} 0.3072378 1 1.930608
## 583 {V6=n,V7=f,V11=t} => {V1=e} 0.3072378 1 1.930608
## 585 {V6=n,V11=t,V18=w} => {V1=e} 0.3072378 1 1.930608
## 587 {V6=n,V11=t,V17=p} => {V1=e} 0.3072378 1 1.930608
## 590 {V6=n,V9=b,V19=o} => {V1=e} 0.3308715 1 1.930608
## 1595 {V6=n,V9=b,V11=t,V19=o} => {V1=e} 0.3072378 1 1.930608
## 1599 {V6=n,V7=f,V9=b,V11=t} => {V1=e} 0.3072378 1 1.930608
## 1603 {V6=n,V9=b,V11=t,V18=w} => {V1=e} 0.3072378 1 1.930608
# Save the rules: write them to a CSV file, then keep a data-frame copy for
# further processing in R.
# write() here is the arules version, which masks base::write when the
# package is attached.
write(
  rules,
  file = "rules.csv",
  sep = ",",
  quote = TRUE,
  row.names = FALSE
)
rules_df <- as(rules, "data.frame")
利用arulesViz对结果进行可视化
#可视化
library(grid)
library(RColorBrewer)
library(arulesViz)
##
## Attaching package: 'arulesViz'
##
## The following object is masked from 'package:arules':
##
## abbreviate
##
## The following object is masked from 'package:base':
##
## abbreviate
# Second, much stricter mining run (minimum support 0.8, confidence 1) that
# yields a small rule set for the plots below.
mushroom.rules <- apriori(
  trans,
  parameter = list(support = 0.8, confidence = 1)
)
##
## Parameter specification:
## confidence minval smax arem aval originalSupport support minlen maxlen
## 1 0.1 1 none FALSE TRUE 0.8 1 10
## target ext
## rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09) (c) 1996-2004 Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 8124 transaction(s)] done [0.02s].
## sorting and recoding items ... [5 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [16 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Six shades of the "Greens" palette, ordered from darkest to lightest;
# shared by both plots below.
green_shades <- brewer.pal(9, "Greens")[9:4]

# Default plot of the rule set, with jittered points shaded by lift.
plot(
  mushroom.rules,
  control = list(jitter = 2, col = green_shades),
  shading = "lift"
)

# Grouped view of the same rules, using k = 100 groups.
plot(
  mushroom.rules,
  method = "grouped",
  control = list(k = 100, col = green_shades)
)
# Graph view of at most the first 20 "edible" rules, shaded by lift.
# The index is capped at length(edible) so the subset does not fail with an
# out-of-bounds subscript when fewer than 20 rules are available.
plot(
  edible[seq_len(min(20, length(edible)))],
  measure = "confidence",
  method = "graph",
  control = list(type = "items"),
  shading = "lift"
)

# Parallel-coordinates view of all "edible" rules, with items reordered.
plot(edible, method = "paracoord", control = list(reorder = TRUE))