R 代码积累
R 代码积累不定期更新
1.阶乘、递归、reduce、sprintf
#NO.1 # 阶乘函数 fact <- function(n){ if(n==0) return(1) #基例在这 else return(n*fact(n-1)) } #1+2!+3!+...+20!的和 #测试 Reduce('+', lapply(1:3,fact)) 结果是9 Reduce('+', lapply(1:20,fact)) #NO.2 #判断101-200之间有多少个素数,并输出所有素数 is.prime <- function(num) { if (num == 2) { TRUE } else if (any(num %% 2:(num-1) == 0)) { FALSE } else { TRUE } } #101共有多少个素数 Reduce('+', lapply(101:200,is.prime)) #打印出每一个素数 for(i in 101:200){ if(!is.prime(i)) next print(sprintf('%d 是素数 %s',i,is.prime(i))) }
2.MD5加密卡号
library(magrittr) library(digest) library(data.table) library(readxl) library(lubridate) dat=read_excel('data_union.xlsx',sheet=4) dim(dat) get_len <- function(a){ return(paste0(nchar(a),a)) } dat$card_len=lapply(dat$bank_card, get_len)%>%unlist() dat$card_md5 <- lapply(dat$card_len, digest, algo="md5",serialize=F)%>%unlist() dat$card_md5_upper <- toupper(dat$card_md5) write.table( dat[,c('card_md5_upper')] ,'yqb_card_md5.txt',row.names = F ,col.names=F ,quote = F) #ym_apply dat$ym_apply=dat$time %m-% months(1)%>% format("%Y%m")%>%as.numeric() #write out card_md5 and ym_apply write.table( dat[c('card_md5_upper','ym_apply')] ,sep=',' ,'yqb_card_ym_md5.txt',row.names = F ,col.names=F ,quote = F) unique(dat$ym_apply)
3.时间函数
https://cran.r-project.org/web/packages/lubridate/vignettes/lubridate.html
4.随机森林可视化
library(randomForest) library(kernlab) # spam数据集 来源于这个 包 library(magrittr) library(caret) #setwd('...') #data <- read.csv('...') data(spam) #加载数据集 # spam 是一个数据集,一条记录代表一封邮件包含哪些特殊词 # ,这封邮件是否是垃圾邮件(y变量)等等一些变量 View(spam) table(spam$type) #看一下Y变量的分布,是否存在不均衡(imbalanced data) #type 是y变量 也可以是0,1,但注意,如果你的数据集是 df, 要预先 将y变量转换为factor ,df$y <- as.factor(df$y) set.seed(2016) #为使模型可再现,这里预先设定随机种子,因为随机森林的 每棵树对行是进行随机有放回的抽样的 rf <- randomForest(type~. ,data=spam ,importance=TRUE #保留变量的重要性 TURE的时候,可通过rf$importance 查看 ,proximity=FALSE #在n棵树种,一个矩阵任意两条记录 落在同一个叶子节点的概率,可以表示两条记录的相似程度 ,ntree=500 #种多少棵树 ,do.trace=20 #每20条显示 误分率 ,na.action=na.omit#一行中只要存在一个NA,这条记录就删掉 ,strata=spam$type #按照Y变量中(0,1的比例,进行上下采样,对占比少的用oversampling,对占比多的用downsampling) ,sampsize=c(1500,1500) #对0,1采样的个数分别是多少,都是有放回的 ,mtry=7 #每一棵决策树,选取几个feature?,对于classification 一般是feature总数的开平方(这个是default) ,keep.forest=FALSE) #保留每一棵数 rf$confusion #混淆矩阵 ##变量重要性,如果你的数据是有上千个变量,可以根据变量的重要性对数据进行降维 par(mfrow = c(2,2)) for(i in 1:4){ plot(sort(rf$importance[,i],decreasing =TRUE)%>%head(20) ,type='h' ,main=paste0(colnames(rf$importance)[i]) ,xlab='variable' ,ylab='importance') text(sort(rf$importance[,i],decreasing =TRUE) ,labels=names(sort(rf$importance[,i]))%>%head(20) ,pos=1 ,cex=0.9) } ## 下面画ROC 曲线,计算AUC library(ROCR) predictions=as.vector(rf$votes[,2]) pred=prediction(predictions,spam$type) perf_AUC=performance(pred,"auc") #Calculate the AUC value AUC=perf_AUC@y.values[[1]] perf_ROC=performance(pred,"tpr","fpr") #plot the actual ROC curve plot(perf_ROC, main="ROC plot") text(0.5,0.5,paste("AUC = ",format(AUC, digits=5, scientific=FALSE))) #cutoff accuracy perf <- performance(pred, measure="acc", x.measure="cutoff") # Get the cutoff for the best accuracy bestAccInd <- which.max(perf@"y.values"[[1]]) bestMsg <- paste("best accuracy=", perf@"y.values"[[1]][bestAccInd], " at cutoff=", round(perf@"x.values"[[1]][bestAccInd], 4)) plot(perf, sub=bestMsg) abline(v=round(perf@"x.values"[[1]][bestAccInd], 4)) # calculate the confusion matrix and plot cm <- confusionMatrix(rf$predicted, reference = spam$type) draw_confusion_matrix(cm) #confusion matrix visualization draw_confusion_matrix <- function(cm) { layout(matrix(c(1,1,2))) par(mar=c(2,2,2,2)) plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n') title('CONFUSION MATRIX', cex.main=2) # create the matrix rect(150, 430, 240, 370, col='#3F97D0') text(195, 435, rf$classes[1], cex=1.2) rect(250, 430, 340, 370, col='#F7AD50') text(295, 435, rf$classes[2], cex=1.2) text(125, 370, 'Predicted', cex=1.3, srt=90, font=2) text(245, 450, 'Actual', cex=1.3, font=2) rect(150, 305, 240, 365, col='#F7AD50') rect(250, 305, 340, 365, col='#3F97D0') text(140, 400, rf$classes[1], cex=1.2, srt=90) text(140, 335, rf$classes[2], cex=1.2, srt=90) # add in the cm results res <- as.numeric(cm$table) text(195, 400, res[1], cex=1.6, font=2, col='white') text(195, 335, res[2], cex=1.6, font=2, col='white') text(295, 400, res[3], cex=1.6, font=2, col='white') text(295, 335, res[4], cex=1.6, font=2, col='white') # add in the specifics plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n') text(10, 85, names(cm$byClass[1]), cex=1.2, font=2) text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2) text(30, 85, names(cm$byClass[2]), cex=1.2, font=2) text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2) text(50, 85, names(cm$byClass[5]), cex=1.2, font=2) text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2) text(70, 85, names(cm$byClass[6]), cex=1.2, font=2) text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2) text(90, 85, names(cm$byClass[7]), cex=1.2, font=2) text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2) # add in the accuracy information text(30, 35, names(cm$overall[1]), cex=1.5, font=2) text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4) text(70, 35, names(cm$overall[2]), cex=1.5, font=2) text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4) }
5.一个多线程 爬虫
## 99健康网 医院抓取 ## #step1:by city url_0='http://yyk.99.com.cn/city.html' content_0=url_0%>%read_html() city_nm=c( content_0%>%html_nodes('.cityarea li a')%>%html_text(trim=T) ,content_0%>%html_nodes('.cityarea .background li a')%>%html_text(trim=T) ) city_url=c( content_0%>%html_nodes('.cityarea li a')%>%html_attr('href')%>%paste0('http://yyk.99.com.cn',.) ,content_0%>%html_nodes('.cityarea .background li a')%>%html_attr('href')%>%paste0('http://yyk.99.com.cn',.) ) length(city_nm) length(city_url) #step2:get hospital by city get_hospital_1 <- function(i){ # i=1 content_1=city_url[i]%>%read_html(encoding='gbk') hospital_nm=content_1%>%html_nodes('.tablist li span')%>%html_text(trim=T) hospital_url=content_1%>%html_nodes('.tablist li a')%>%html_attr('href')%>%paste0(.,'jianjie.html') return(data.frame( stringsAsFactors = F ,city_nm=rep(city_nm[i],length(hospital_nm)) ,hospital_nm=hospital_nm ,hospital_url=hospital_url )) } system.time({ x <- 301:length(city_url) res_99jk_3 <- do.call('rbind',lapply(x,get_hospital_1)) }) write.table(res_99jk_1,'res_99jk_1.csv',row.names = F,sep='\t') write.table(res_99jk_2,'res_99jk_2.csv',row.names = F,sep='\t') write.table(res_99jk_3,'res_99jk_3.csv',row.names = F,sep='\t') res_99jk <- rbind(res_99jk_1 ,res_99jk_2 ,res_99jk_3) #step3 get all fileds level beds_cnt... res_99jk <- read.table('res_99jk.csv',header = T) hospital_url_pool=res_99jk$hospital_url%>%unique() get_detail <- function(hospital_url){ tryCatch({ res=hospital_url%>%read_html(encoding='gbk')%>%html_nodes('.tdr')%>%html_text(trim=T) df=data.frame(stringsAsFactors = F ,hospital_url=hospital_url ,nm=res[1] ,addr=res[2] ,hp_admin=res[3] ,start_dt=res[4] ,type=res[5] ,level=res[6] ,section_cnt=res[7] ,doc_cnt=res[8] ,bed_cnt=res[9] ,traffic=res[10] ,insurance=res[11] ,web_site=res[12] ,phone=res[13] ,addr=res[14] ,postcode=res[15]) },error = function(e) { rm(df) df=data.frame(stringsAsFactors = F ,hospital_url=hospital_url ,nm=NA ,addr=NA ,hp_admin=NA ,start_dt=NA ,type=NA ,level=NA ,section_cnt=NA ,doc_cnt=NA ,bed_cnt=NA ,traffic=NA ,insurance=NA ,web_site=NA ,phone=NA ,addr=NA ,postcode=NA) }) return(df) } op <- pboptions(type = "timer") x <- hospital_url_pool[1:5000] system.time(rdf_99jk_1 <- do.call(rbind.data.frame,pblapply(x,get_detail))) pboptions(op) write.table(rdf_99jk_1,'rdf_99jk_1_5000.csv',row.names = F,sep='\t') ##多线程爬虫 pkg <- c('RMySQL','DBI','xml2','rvest','magrittr','xml2','stringr','httpuv','R2HTML','pbapply','curl') for(i in pkg){ if(!i %in% installed.packages()[,1]) (install.packages(i)) } library(magrittr) library(bitops) library(rvest) library(stringr) library(DBI) library(RCurl) library(curl) library(sqldf) library(gdata) library(xml2) library(pbapply) library(parallel) res_99jk <- read.table('res_99jk.csv',header = T,stringsAsFactors = F) hospital_url_pool=res_99jk$hospital_url%>%unique() get_detail <- function(hospital_url){ tryCatch({ res=hospital_url%>%read_html(encoding='gbk')%>%html_nodes('.tdr')%>%html_text(trim=T) df=data.frame(stringsAsFactors = F ,hospital_url=hospital_url ,nm=res[1] ,addr=res[2] ,hp_admin=res[3] ,start_dt=res[4] ,type=res[5] ,level=res[6] ,section_cnt=res[7] ,doc_cnt=res[8] ,bed_cnt=res[9] ,traffic=res[10] ,insurance=res[11] ,web_site=res[12] ,phone=res[13] ,addr=res[14] ,postcode=res[15]) },error = function(e) { rm(df) df=data.frame(stringsAsFactors = F ,hospital_url=hospital_url ,nm=NA ,addr=NA ,hp_admin=NA ,start_dt=NA ,type=NA ,level=NA ,section_cnt=NA ,doc_cnt=NA ,bed_cnt=NA ,traffic=NA ,insurance=NA ,web_site=NA ,phone=NA ,addr=NA ,postcode=NA) }) return(df) } # 用system.time来返回计算所需时间 system.time({ x <- hospital_url_pool[5001:length(hospital_url_pool)] cl <- makeCluster(4) # 初始化四核心集群 clusterEvalQ(cl, library(magrittr)) clusterEvalQ(cl, library(rvest)) clusterEvalQ(cl, library(RCurl)) clusterEvalQ(cl, library(curl)) clusterEvalQ(cl, library(xml2)) results <- parLapply(cl,x,get_detail) # lapply的并行版本 res.df <- do.call('rbind',results) # 整合结果 stopCluster(cl) # 关闭集群 }) # res2=do.call(rbind.data.frame,results) res2=data.frame(stringsAsFactors = F) for(i in 1:length(results)){ print(paste0('this is ',i)) if (!identical(dim(results[[i]]),dim(results[[1]]))) next res2=rbind(res2,results[[i]]) }