Baidu Baike Crawler

WebMagic setup guide: http://www.cnblogs.com/justcooooode/p/7913365.html

Crawl Target

I recently needed to analyze some domain concepts, so I decided to crawl the corresponding entries on Baidu Baike and extract information and knowledge from them.

I divide query results into two classes: entries already indexed by Baike, and entries not yet indexed. Even for unindexed entries, some information can still be extracted.

For an unindexed entry, Baike shows several related entries, so that part is crawled as well.

For an indexed entry, the information I need is the title, synonyms, summary, infobox, and the Baidu Zhixin (related-entry recommendation) content.
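The first four fields sit directly in the entry page's HTML, while the Zhixin panel is loaded dynamically from a JSON endpoint. Judging from the JsonPath expressions used in the code below, the response is an array of relation groups, each with a tipTitle and a list of entry titles. Here is a minimal sketch of parsing that shape with WebMagic's JsonPathSelector; the sample JSON is made up for illustration, not captured from the live API:

import us.codecraft.webmagic.selector.JsonPathSelector;

public class ZhixinJsonDemo {
    public static void main(String[] args) {
        // Illustrative response shape only: the field names (tipTitle, data, title)
        // come from the JsonPath expressions in the spider; the values are invented.
        String json = "[{\"tipTitle\":\"相关人物\",\"data\":[{\"title\":\"词条A\"},{\"title\":\"词条B\"}]},"
                + "{\"tipTitle\":\"相关作品\",\"data\":[{\"title\":\"词条C\"}]}]";
        // Count the relation groups, then pull each group's label and titles
        int groups = new JsonPathSelector("$.[*]").selectList(json).size();
        for (int i = 0; i < groups; i++) {
            String tip = new JsonPathSelector("$.[" + i + "].tipTitle").select(json);
            System.out.println(tip + ": "
                    + new JsonPathSelector("$.[" + i + "].data[*].title").selectList(json));
        }
    }
}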

Code

/**
 * @Title: BaiduBaikeSpider.java
 * @Package: edu.heu.kg.spider
 * @Description: Baidu Baike spider based on WebMagic
 * @author: LJH
 * @date: 2017-12-01 09:43:16
 */
package edu.heu.kg.spider;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.JsonPathSelector;

public class BaiduBaikeSpider implements PageProcessor {

    // Crawl configuration: retries, politeness delay, charset, user agent
    private Site site = Site.me().setRetryTimes(3).setSleepTime(500).setCharset("UTF-8").setUserAgent(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");
    // Current query word (shared with process() through this static field)
    private static String word;
    // Output directory
    private static final String PATH = "D:\\百科词条\\百度百科\\";

    @Override
    public Site getSite() {
        return site;
    }

    @Override
    public void process(Page page) {

        if (page.getUrl().regex("https://baike.baidu.com/item/.*").match()) {

            Html html = page.getHtml();

            // The h1 title is only present when the entry exists
            if (html.xpath("/html/body/div[3]/div[2]/div/div[2]/dl[1]/dd/h1/text()").match()) {

                // Title
                String title = html.xpath("/html/body/div[3]/div[2]/div/div[2]/dl[1]/dd/h1/text()").toString();
                // Synonyms
                String synonym = html.xpath("/html/body/div[3]/div[2]/div/div[2]/span/span/text()").all().toString();
                // Summary
                String summary = html.xpath("/html/body/div[3]/div[2]/div/div[2]/div[@class='lemma-summary']/allText()")
                        .all().toString().replaceAll(",", "");
                // Infobox
                String infobox = html
                        .xpath("/html/body/div[3]/div[2]/div/div[2]/div[@class='basic-info cmn-clearfix']/dl/allText()")
                        .all().toString();

                StringBuilder sb = new StringBuilder();
                sb.append(word + "\t" + title + "\t" + synonym + "\t" + summary + "\t" + infobox + "\n");
                try {
                    outPut(PATH + "百科词条.txt", sb.toString());
                } catch (IOException e) {
                    e.printStackTrace();
                }

                // Baidu Zhixin is loaded dynamically, so grab the lemma id first
                // and request the JSON API directly
                String zhixinID = html.xpath("//*[@id=\"zhixinWrap\"]/@data-newlemmaid").toString();
                String zhixinURL = "https://baike.baidu.com/wikiui/api/zhixinmap?lemmaId=" + zhixinID;
                // Enqueue the Zhixin request
                page.addTargetRequest(zhixinURL);

            }
            // Entry does not exist: enqueue the "none" search page to collect
            // the most relevant entries instead
            else {
                page.addTargetRequest("https://baike.baidu.com/search/none?word=" + word);
            }
        }
        // Parse the Baidu Zhixin JSON
        else if (page.getUrl().regex("https://baike.baidu.com/wikiui/.*").match()) {

            Map<String, List<String>> resultMap = new LinkedHashMap<>();
            // Count the relation groups first; some words have two, others three
            List<String> tempList = new JsonPathSelector("$.[*]").selectList(page.getRawText());
            int typeNums = tempList.size();
            for (int i = 0; i < typeNums; i++) {
                resultMap.put(new JsonPathSelector("$.[" + i + "].tipTitle").select(page.getRawText()),
                        new JsonPathSelector("$.[" + i + "].data[*].title").selectList(page.getRawText()));
            }

            StringBuilder sb = new StringBuilder();
            sb.append(word + "\t");
            resultMap.forEach((key, value) -> sb.append(key + ":" + value + "\t"));
            sb.append("\n");
            try {
                outPut(PATH + "相关词条_知心.txt", sb.toString());
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // Entry not yet indexed by Baidu Baike: collect the related entries
        else if (page.getUrl().regex("https://baike.baidu.com/search/none\\?word=.*").match()) {

            List<String> list = page.getHtml().xpath("//*[@id=\"body_wrapper\"]/div[1]/dl/dd/a/@href").all();
            try {
                list = urlDecode(list);
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            }

            StringBuilder sb = new StringBuilder();
            sb.append(word + "\t" + list + "\n");
            try {
                outPut(PATH + "相关词条_未收录.txt", sb.toString());
            } catch (IOException e) {
                e.printStackTrace();
            }

        } else
            System.out.println("Nothing!");

    }

    // Extract the entry names from the hrefs and URL-decode them
    private static List<String> urlDecode(List<String> rawList) throws UnsupportedEncodingException {

        List<String> resultList = new LinkedList<>();
        String reg = "https://baike.baidu.com/item/(.*)/\\d+";
        Pattern p = Pattern.compile(reg);
        Matcher m;
        for (String str : rawList) {
            m = p.matcher(str);
            if (m.find()) {
                resultList.add(java.net.URLDecoder.decode(m.group(1), "UTF-8"));
            }
        }
        return resultList;

    }

    // Append a line to the output file; write UTF-8 explicitly so the result
    // does not depend on the platform default charset
    private static void outPut(String path, String content) throws IOException {

        FileOutputStream fos = new FileOutputStream(path, true);
        OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
        BufferedWriter bw = new BufferedWriter(osw);
        bw.write(content);
        bw.close();

    }

    public static void main(String[] args) throws IOException {

        try {
            BufferedReader br = new BufferedReader(new FileReader("D:\\nba.txt"));
            while ((word = br.readLine()) != null) {
                // One Spider per word: addUrl is varargs, but queueing all the URLs
                // on a single Spider is problematic here, because the static `word`
                // field would no longer match the page being processed
                Spider.create(new BaiduBaikeSpider()).addPipeline(new ConsolePipeline())
                        .addUrl("https://baike.baidu.com/item/" + word).run();
            }
            br.close();
        } catch (IOException e1) {
            e1.printStackTrace();
        }

    }

}
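One design note: the query term lives in the static word field, which is why main creates and runs a fresh Spider per input line. If you would rather run a single Spider over the whole word list, one alternative is to carry each term on its own request via WebMagic's Request extras. The sketch below is my suggestion, not part of the original code, and assumes it sits in the same package as the spider:

import java.util.List;

import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;

public class MultiWordRunner {

    public static void run(List<String> words) {
        Spider spider = Spider.create(new BaiduBaikeSpider());
        for (String w : words) {
            Request request = new Request("https://baike.baidu.com/item/" + w);
            request.putExtra("word", w);   // the term travels with its own request
            spider.addRequest(request);
        }
        spider.run();
        // Inside process(), read the term back instead of using the static field:
        // String word = (String) page.getRequest().getExtra("word");
    }
}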

Results

Basic information of an entry (screenshot):

Related-entry information (screenshot):

Please credit the original post when reposting: http://www.cnblogs.com/justcooooode/p/8068045.html
