Keywords and rule-matching baseline for the paper
For the detailed line of thinking, see the earlier note 小论文树立0317.
The keywords fall into the following categories:
/**** Some general filter terms. Could these general terms be discovered automatically from the words that co-occur with the programs, e.g. by looking at tf-idf? (see the sketch after this block) ****/
public static String[] tvTerms={"观看","收看","节目","电视","表演","演出"};
public static String[] channelTerms={"央视","中央电视台","春晚","春节联欢晚会"};
public static String[] commentTerms={"赞","好看","精彩","失望","感动","吐槽","无聊"};
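The comment above asks whether the general filter terms could be found automatically from words that co-occur with the programs. Below is a minimal tf-idf sketch of that idea; the class name, the toy corpus in main, and the assumption that each program's weibo text is already tokenized are all hypothetical, not part of the project.

package com.bobo.baseline;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

public class TfIdfTermSketch {

    // corpus: program name -> tokenized weibo words collected for that program
    // (tokenization and data loading are out of scope for this sketch)
    public static void rankTerms(Map<String, List<String>> corpus) {
        int docCount = corpus.size();

        // document frequency: how many program corpora contain the term
        Map<String, Integer> df = new HashMap<String, Integer>();
        for (List<String> words : corpus.values()) {
            for (String w : new HashSet<String>(words)) {
                df.put(w, df.containsKey(w) ? df.get(w) + 1 : 1);
            }
        }

        for (Map.Entry<String, List<String>> doc : corpus.entrySet()) {
            // term frequency within this program's weibo collection
            Map<String, Integer> tf = new HashMap<String, Integer>();
            for (String w : doc.getValue()) {
                tf.put(w, tf.containsKey(w) ? tf.get(w) + 1 : 1);
            }
            for (Map.Entry<String, Integer> e : tf.entrySet()) {
                double idf = Math.log((double) docCount / df.get(e.getKey()));
                double tfidf = e.getValue() * idf;
                // terms that are frequent in every program corpus get a near-zero idf,
                // which is what would single them out as candidates for the general
                // tvTerms / commentTerms lists; high tf-idf terms are candidates
                // for program-specific keywords instead
                System.out.println(doc.getKey() + "\t" + e.getKey() + "\t" + tfidf);
            }
        }
    }

    public static void main(String[] args) {
        Map<String, List<String>> corpus = new HashMap<String, List<String>>();
        corpus.put("扶不扶", Arrays.asList("收看", "春晚", "小品", "沈腾"));
        corpus.put("时间都去哪儿", Arrays.asList("收看", "春晚", "歌", "王铮亮"));
        rankTerms(corpus);
    }
}

The split the comment hints at is between high-tf/low-idf terms (shared across programs, i.e. general filter terms) and high tf-idf terms (program-specific keywords).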
For each program:
the program's performers and the program's category,
plus expansions based on the performers and the category; these carry a natural weight.
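One way to hold the per-program keywords together with the "natural weight" just mentioned is a small holder class. This is only an illustration: the actual project keeps the lists as flat arrays in Constants (shown further below), and the concrete weight values here are placeholders.

package com.bobo.baseline;

import java.util.HashMap;
import java.util.Map;

// Hypothetical holder for one program's keywords; the weights below are
// illustrative placeholders, not values used anywhere in the project.
public class ProgramKeywords {

    public final String title;
    // keyword -> weight; performer names get the highest ("natural") weight,
    // category terms slightly less, expanded terms the least
    public final Map<String, Double> weightedTerms = new HashMap<String, Double>();

    public ProgramKeywords(String title, String[] actors, String[] categories, String[] expanded) {
        this.title = title;
        for (String a : actors) {
            weightedTerms.put(a, 1.0);   // performers: full weight
        }
        for (String c : categories) {
            weightedTerms.put(c, 0.8);   // category terms: slightly lower
        }
        for (String e : expanded) {
            weightedTerms.put(e, 0.5);   // expansions: lowest weight
        }
    }
}

For instance, one could build new ProgramKeywords("扶不扶", Constants.ActorFubu, Constants.categoryXiaopin, new String[]{}) per program, though the project never went that far.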
Filtering strategy:
- If the weibo contains both the title and one of the program's performers, label True.
- If the weibo contains both the title and the program's category, label True.
- If the program title is wrapped in title quotation marks (《》), label True.
- For the remaining keywords, sum their weights; if the sum exceeds a threshold, label True.
- Determining the threshold: (set the keywords aside for now) <the weighting scheme was never carried any further; a sketch of what such a weighted rule could look like follows the Java project below>
- The Java project for the keyword and rule matching baseline:
// KeywordAndRulesMatherBaseLine.java
package com.bobo.baseline;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;

import com.bobo.features.ActorsFeature;
import com.bobo.features.CategoryFeature;
import com.bobo.features.ExpandFeature;
import com.bobo.features.GeneralRulesFeatures;
import com.bobo.features.TitleFeature;
import com.bobo.myinterface.MyFileFilter;
import com.bobo.util.Constants;
import com.bobo.util.FileUtil;

public class KeywordAndRulesMatherBaseLine {

    private ArrayList<File> dealedList = new ArrayList<File>();
    private ArrayList<File> keywordsOutList = new ArrayList<File>();

    public static void main(String[] args) {
        KeywordAndRulesMatherBaseLine baseLine = new KeywordAndRulesMatherBaseLine();
        baseLine.init();
        baseLine.labelForAll();
        System.out.println("整體執行完畢");
    }

    private void init() {
        // collect all annotated (.dealed) files and derive the .keywordsMatch output paths
        FileUtil.showAllFiles(new File(Constants.DataDir + "/" + "raw_data"),
                new MyFileFilter(".dealed"), dealedList);
        for (int i = 0; i < dealedList.size(); i++) {
            String dealedPath = dealedList.get(i).getAbsolutePath();
            String outPath = dealedPath.substring(0, dealedPath.lastIndexOf(".")) + ".keywordsMatch";
            keywordsOutList.add(new File(outPath));
        }
    }

    public void labelForAll() {
        // dispatch each annotated file to labelForFile with its program-specific keywords
        for (int i = 0; i < dealedList.size(); i++) {
            if (dealedList.get(i).getAbsolutePath().contains("时间都去哪儿")) {
                labelForFile(dealedList.get(i), keywordsOutList.get(i),
                        Constants.ActorShijian, Constants.categoryGequ, "时间都去哪儿");
            } else if (dealedList.get(i).getAbsolutePath().contains("团圆饭")) {
                labelForFile(dealedList.get(i), keywordsOutList.get(i),
                        Constants.ActorTuanyuan, Constants.categoryMoshu, "团圆饭");
            } else if (dealedList.get(i).getAbsolutePath().contains("说你什么好")) {
                labelForFile(dealedList.get(i), keywordsOutList.get(i),
                        Constants.ActorShuoni, Constants.categoryXiangsheng, "说你什么好");
            } else if (dealedList.get(i).getAbsolutePath().contains("我就这么个人")) {
                labelForFile(dealedList.get(i), keywordsOutList.get(i),
                        Constants.ActorWojiu, Constants.categoryXiaopin, "我就这么个人");
            } else if (dealedList.get(i).getAbsolutePath().contains("我的要求不算高")) {
                labelForFile(dealedList.get(i), keywordsOutList.get(i),
                        Constants.ActorWode, Constants.categoryGequ, "我的要求不算高");
            } else if (dealedList.get(i).getAbsolutePath().contains("扶不扶")) {
                labelForFile(dealedList.get(i), keywordsOutList.get(i),
                        Constants.ActorFubu, Constants.categoryXiaopin, "扶不扶");
            } else if (dealedList.get(i).getAbsolutePath().contains("人到礼到")) {
                labelForFile(dealedList.get(i), keywordsOutList.get(i),
                        Constants.ActorRendao, Constants.categoryXiaopin, "人到礼到");
            }
            System.out.println(keywordsOutList.get(i) + "处理完毕!");
        }
    }

    public void labelForFile(File dealedFile, File keywordsFile,
            String[] actors, String[] categorys, String title) {
        FileReader fr = null;
        BufferedReader br = null;
        FileWriter fw = null;
        BufferedWriter bw = null;
        PrintWriter pw = null;
        String line = null;
        try {
            fr = new FileReader(dealedFile);
            br = new BufferedReader(fr);
            fw = new FileWriter(keywordsFile);
            bw = new BufferedWriter(fw);
            pw = new PrintWriter(bw);
            while ((line = br.readLine()) != null) {
                // each input line: true label "\t" ... "\t" weibo text;
                // output: true label "\t" predicted label "\t" weibo text
                String[] lineArr = line.split("\t");
                String weiboText = lineArr[lineArr.length - 1];
                pw.println(lineArr[0] + "\t" + labelForSingle(weiboText, actors, categorys, title)
                        + "\t" + weiboText);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                br.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            pw.flush();
            pw.close();
        }
    }

    public Integer labelForSingle(String weiboText, String[] actors, String[] categorys, String title) {
        // rule 1: any performer name counts as a positive
        for (String actor : actors) {
            if (weiboText.contains(actor)) {
                return 1;
            }
        }
        // rule 2: any category keyword counts as a positive
        for (String cate : categorys) {
            if (weiboText.contains(cate)) {
                return 1;
            }
        }
        // rule 3: general TV terms and comment terms count as positives
        for (String word : Constants.tvTerms) {
            if (weiboText.contains(word)) {
                return 1;
            }
        }
        for (String word : Constants.commentTerms) {
            if (weiboText.contains(word)) {
                return 1;
            }
        }
        // rule 4: the title immediately following a 《 mark counts as a positive
        if (!weiboText.contains("《") || !weiboText.contains(title)) {
            return 0;
        } else {
            int symbolIndex = weiboText.indexOf("《");
            int titleIndex = weiboText.indexOf(title);
            if (titleIndex == symbolIndex + 1) {
                return 1;
            }
        }
        return 0;
    }
}

// Constants.java
package com.bobo.util;

public class Constants {

    public final static String RootDir = "H:/paper_related/socialTvProgram";
    public final static String DataDir = "/media/新加卷/小论文实验/data/liweibo";

    // 时间都去哪儿
    public final static String[] ActorShijian = {"王铮亮"};
    // 我的要求不算高
    public final static String[] ActorWode = {"黄渤"};
    // 团圆饭
    public final static String[] ActorTuanyuan = {"YIF", "yif", "Yif", "王亦丰"};
    // 说你什么好
    public final static String[] ActorShuoni = {"曹云金", "刘云天"};
    // 我就这么个人
    public final static String[] ActorWojiu = {"冯巩", "曹随峰", "蒋诗萌"};
    // 扶不扶
    public final static String[] ActorFubu = {"杜晓宇", "马丽", "沈腾"};
    // 人到礼到
    public final static String[] ActorRendao = {"郭子", "郭冬临", "邵峰", "牛莉"};

    /*** program categories ***/
    public final static String[] categoryGequ = {"歌", "唱"};
    public final static String[] categoryXiaopin = {"小品"};
    public final static String[] categoryMoshu = {"魔术"};
    public final static String[] categoryXiangsheng = {"相声"};

    /**** general filter terms ****/
    public static String[] tvTerms = {"观看", "收看", "节目", "电视", "表演", "演出"};
    public static String[] channelTerms = {"央视", "中央电视台", "春晚", "春节联欢晚会"};
    public static String[] commentTerms = {"赞", "好看", "精彩", "吐槽", "无聊", "不错", "给力", "接地气"};
}
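As noted in the threshold bullet above, the weighted-sum rule was never implemented. Below is a minimal sketch of how it could sit next to labelForSingle, reusing the hypothetical ProgramKeywords holder from earlier; the weights and the threshold would still have to be chosen empirically.

package com.bobo.baseline;

import java.util.Map;

// Hypothetical weighted-sum variant of labelForSingle; neither the weights nor
// the threshold were determined in the actual project.
public class WeightedSumLabeler {

    public int label(String weiboText, ProgramKeywords keywords, double threshold) {
        double score = 0.0;
        for (Map.Entry<String, Double> e : keywords.weightedTerms.entrySet()) {
            if (weiboText.contains(e.getKey())) {
                score += e.getValue();   // accumulate the weight of every matched keyword
            }
        }
        // a single strong match (e.g. a performer name) can clear a low threshold
        // on its own; weaker terms only count in combination
        return score > threshold ? 1 : 0;
    }
}

With the placeholder weights from the ProgramKeywords sketch, label(text, keywords, 0.9) would fire on a single performer match but would need at least two weaker terms otherwise.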
- The Python project for the evaluation metrics:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import myUtil
from sklearn import metrics

root_dir = "/media/新加卷/小论文实验/data/liweibo/raw_data"


def loadAllFileWithSuffix(suffix):
    file_list = list()
    myUtil.traverseFile(root_dir, suffix, file_list)
    return file_list


# inFilePath is the .keywordsMatch file under each program directory;
# its format is: true label "\t" predicted label "\t" weibo text
def testForEachFile(inFilePath):
    y_true = list()
    y_pred = list()
    print(inFilePath)
    with open(inFilePath) as inFile:
        for line in inFile:
            y_true.append(int(line.split("\t")[0]))
            y_pred.append(int(line.split("\t")[1]))
    # note: precision is taken from accuracy_score here, so the reported
    # precision always equals the accuracy
    precision = metrics.accuracy_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    accuracy = metrics.accuracy_score(y_true, y_pred)
    f = metrics.fbeta_score(y_true, y_pred, beta=1)
    print("precision:%0.2f,recall:%0.2f,f:%0.2f,accuracy:%0.2f" %
          (precision, recall, f, accuracy))
    return (precision, recall, accuracy, f)


# call testForEachFile on each file in turn and average precision, recall, accuracy, f
def testForAll(inFileList):
    mean_precision = 0.0
    mean_recall = 0.0
    mean_accuracy = 0.0
    mean_f = 0.0
    for inFilePath in inFileList:
        (precision, recall, accuracy, f) = testForEachFile(inFilePath)
        mean_precision += precision
        mean_recall += recall
        mean_accuracy += accuracy
        mean_f += f
    listLen = len(inFileList)
    mean_precision /= listLen
    mean_recall /= listLen
    mean_accuracy /= listLen
    mean_f /= listLen
    print("所有节目各项目指标的平均值:")
    print("mean_precision:%0.2f,mean_recall:%0.2f,mean_f:%0.2f,mean_accuracy:%0.2f" %
          (mean_precision, mean_recall, mean_f, mean_accuracy))
    return (mean_precision, mean_recall, mean_accuracy, mean_f)


def main():
    fileList = loadAllFileWithSuffix(['keywordsMatch'])
    testForAll(fileList)


if __name__ == '__main__':
    main()
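Because the script above takes precision from metrics.accuracy_score, precision and accuracy coincide in the output below. A small cross-check that computes the four metrics directly from one .keywordsMatch file (format: true label "\t" predicted label "\t" weibo text) can be used to verify the numbers; this is only a sketch, the class name is hypothetical and degenerate cases (no positive predictions) are not handled.

package com.bobo.baseline;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

// Sketch: direct precision/recall/F1/accuracy computation for one .keywordsMatch file.
public class MetricsCheck {

    public static void main(String[] args) throws IOException {
        String path = args[0];  // path to a .keywordsMatch file
        int tp = 0, fp = 0, fn = 0, tn = 0;
        BufferedReader br = new BufferedReader(new FileReader(path));
        String line;
        while ((line = br.readLine()) != null) {
            String[] parts = line.split("\t");
            int yTrue = Integer.parseInt(parts[0]);
            int yPred = Integer.parseInt(parts[1]);
            if (yTrue == 1 && yPred == 1) tp++;
            else if (yTrue == 0 && yPred == 1) fp++;
            else if (yTrue == 1 && yPred == 0) fn++;
            else tn++;
        }
        br.close();
        double precision = tp / (double) (tp + fp);
        double recall = tp / (double) (tp + fn);
        double f1 = 2 * precision * recall / (precision + recall);
        double accuracy = (tp + tn) / (double) (tp + fp + fn + tn);
        System.out.printf("precision:%.2f,recall:%.2f,f:%.2f,accuracy:%.2f%n",
                precision, recall, f1, accuracy);
    }
}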
The final results:
/media/新加卷/小论文实验/data/liweibo/raw_data/人到礼到/人到礼到.title.sample.annotate.keywordsMatch
precision:0.87,recall:0.84,f:0.89,accuracy:0.87
/media/新加卷/小论文实验/data/liweibo/raw_data/团圆饭/团圆饭.title.sample.annotate.keywordsMatch
precision:0.81,recall:0.98,f:0.79,accuracy:0.81
/media/新加卷/小论文实验/data/liweibo/raw_data/我就这么个人/我就这么个人.title.sample.annotate.keywordsMatch
precision:0.94,recall:0.97,f:0.96,accuracy:0.94
/media/新加卷/小论文实验/data/liweibo/raw_data/我的要求不算高/我的要求不算高.title.sample.annotate.keywordsMatch
precision:0.91,recall:0.94,f:0.93,accuracy:0.91
/media/新加卷/小论文实验/data/liweibo/raw_data/扶不扶/扶不扶.title.sample.annotate.keywordsMatch
precision:0.72,recall:0.69,f:0.81,accuracy:0.72
/media/新加卷/小论文实验/data/liweibo/raw_data/时间都去哪儿/时间都去哪儿.title.sample.annotate.keywordsMatch
precision:0.72,recall:0.62,f:0.73,accuracy:0.72
/media/新加卷/小论文实验/data/liweibo/raw_data/说你什么好/说你什么好.title.sample.annotate.keywordsMatch
precision:0.93,recall:0.98,f:0.92,accuracy:0.93
所有节目各项目指标的平均值:
mean_precision:0.84,mean_recall:0.86,mean_f:0.86,mean_accuracy:0.84