WebMagic爬取北京市政信件内容
我创建了Letter类用来储存信件,并继承JsonFilePipeline编写了LetterFilePipeline(重写其process方法),使爬取结果保存时以信件Id作为文件名;爬取采用多线程,最终结果保存到letters目录下。
Letter
package org.example.crawler_letter;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Plain data holder for one letter.
 *
 * <p>Accessors, {@code equals}/{@code hashCode}/{@code toString} and both constructors are
 * generated by the Lombok annotations below. The all-args constructor takes its parameters in
 * field declaration order, so the field order must not be changed casually.
 *
 * <p>The first eight fields are filled from the list response in {@code LetterMain}; the last
 * four are filled from the crawled detail page.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class Letter {
/** Letter identifier; used as the {@code originalId} query parameter of the detail URL and as the output file name. */
private String originalId;
/** Value of the {@code letterType} key in the list response. */
private String letterType;
/** Value of the {@code letterTypeName} key; {@code LetterMain} compares it with "咨询" to pick the detail URL. */
private String letterTypeName;
/** Title from the list response; {@code LetterMain} later overwrites it with the title crawled from the detail page. */
private String letterTitle;
/** Value of the {@code showLetterTitle} key (presumably a display variant of the title - TODO confirm). */
private String showLetterTitle;
/** Value of the {@code writeDate} key in the list response. */
private String writeDate;
/** Value of the {@code orgNames} key in the list response. */
private String orgNames;
/** Value of the {@code showOrgNames} key (presumably a display variant of the organisation names - TODO confirm). */
private String showOrgNames;
/** Writer name extracted from the detail page. */
private String writeName;
/** Answer date extracted from the detail page. */
private String answerDate;
/** Question text extracted from the detail page. */
private String question;
/** Answer text extracted from the detail page. */
private String answer;
}
LetterFilePipeline
package org.example.crawler_letter;
import com.alibaba.fastjson.JSON;
import lombok.SneakyThrows;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import java.io.FileWriter;
import java.io.PrintWriter;
/**
 * Pipeline that stores every crawled page as {@code <originalId>.json} instead of the
 * default file name chosen by {@link JsonFilePipeline}.
 *
 * <p>Files are written to {@code <path>/<task UUID>/}.
 */
public class LetterFilePipeline extends JsonFilePipeline {

    /**
     * Creates the pipeline.
     *
     * @param path root directory passed on to {@link JsonFilePipeline}
     */
    public LetterFilePipeline(String path) {
        super(path);
    }

    /**
     * Serializes all extracted fields of one page to JSON and writes them to a file named
     * after the page's {@code originalId} field.
     *
     * @param resultItems fields extracted by the page processor; must contain {@code originalId}
     * @param task        the running task; its UUID is used as the sub-directory name
     */
    @SneakyThrows
    @Override
    public void process(ResultItems resultItems, Task task) {
        String directory = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        String fileName = resultItems.get("originalId");
        // try-with-resources closes the file even if serialization or writing fails.
        // UTF-8 is requested explicitly so the output does not depend on the platform
        // default charset (the content is Chinese text that is parsed as JSON later on).
        try (PrintWriter printWriter =
                     new PrintWriter(this.getFile(directory + fileName + ".json"), "UTF-8")) {
            printWriter.write(JSON.toJSONString(resultItems.getAll()));
        }
    }
}
LetterProcess
package org.example.crawler_letter;
import org.jsoup.nodes.Document;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
/**
 * Page processor for a single letter detail page.
 *
 * <p>Extracts the letter id (from the URL), title, writer name, question, answer and answer
 * date, and publishes them as page fields for the pipelines.
 */
public class LetterProcess implements PageProcessor {
    /** Number of leading characters dropped from the writer element's text. */
    private static final int WRITER_PREFIX_LENGTH = 4;
    /** Number of leading characters dropped from the answer-date element's text. */
    private static final int ANSWER_DATE_PREFIX_LENGTH = 5;
    /**
     * Count of pages handed to this processor. Atomic because the spider calls
     * {@link #process(Page)} from many threads; a plain {@code int++} would lose updates.
     */
    private static final java.util.concurrent.atomic.AtomicInteger PROCESSED =
            new java.util.concurrent.atomic.AtomicInteger();

    /** Site settings, configured once instead of on every {@link #getSite()} call. */
    private final Site site = new Site()
            .setCharset("UTF-8")
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188");

    /**
     * Extracts the letter fields from one detail page.
     *
     * <p>If the page lacks the title element, or the writer / answer-date text is shorter than
     * the prefix that has to be stripped, the page is marked as skipped so that no pipeline
     * runs for it, instead of failing with an exception half-way through.
     *
     * @param page the downloaded detail page
     */
    @Override
    public void process(Page page) {
        System.out.println(page.getUrl());
        System.out.println(PROCESSED.getAndIncrement());

        String url = String.valueOf(page.getUrl());
        Document doc = page.getHtml().getDocument();

        String writer = doc.getElementsByClass("col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted").text();
        String answerDate = doc.getElementsByClass("col-xs-12 col-sm-3 col-md-3 my-2").text();
        boolean incomplete = doc.select("strong").isEmpty()
                || writer.length() < WRITER_PREFIX_LENGTH
                || answerDate.length() < ANSWER_DATE_PREFIX_LENGTH;
        if (incomplete) {
            System.err.println("Skipping page without the expected letter layout: " + url);
            page.setSkip(true);
            return;
        }

        // The id is whatever follows the last '=' of the URL (the originalId query parameter).
        page.putField("originalId", url.substring(url.lastIndexOf("=") + 1));
        page.putField("letterTitle", doc.select("strong").first().text());
        page.putField("writeName", writer.substring(WRITER_PREFIX_LENGTH));
        page.putField("question", doc.getElementsByClass("col-xs-12 col-md-12 column p-2 text-muted mx-2").text());
        page.putField("answer", doc.getElementsByClass("col-xs-12 col-md-12 column p-4 text-muted my-3").text());
        page.putField("answerDate", answerDate.substring(ANSWER_DATE_PREFIX_LENGTH));
    }

    /**
     * Returns the site settings (UTF-8 charset and a desktop browser user agent).
     *
     * @return the shared {@link Site} instance of this processor
     */
    @Override
    public Site getSite() {
        return site;
    }
}
LetterMain
package org.example.crawler_letter;
import lombok.SneakyThrows;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.node.ArrayNode;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import us.codecraft.webmagic.Spider;
import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Entry point of the crawler.
 *
 * <ol>
 *   <li>Downloads the letter list (one probe request for the total count, one request for
 *       the whole list).</li>
 *   <li>Crawls the detail page of every letter with WebMagic; each page is stored as
 *       {@code letter_json/<task UUID>/<originalId>.json}.</li>
 *   <li>Merges the crawled details into the list entries and writes one
 *       {@code letters/<originalId>.json} file per letter.</li>
 * </ol>
 */
public class LetterMain {
    private static final String LIST_URL_PREFIX = "https://www.beijing.gov.cn/hudong/hdjl/sindex/bjah-index-hdjl!letterListJson.action?keyword=&startDate=&endDate=&letterType=0&page.pageNo=1&page.pageSize=";
    private static final String LIST_URL_SUFFIX = "&orgtitleLength=26";
    private static final String CONSULT_DETAIL_URL = "https://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=";
    private static final String SUGGEST_DETAIL_URL = "https://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId=";
    /** Data directory used when no directory is given on the command line. */
    private static final String DEFAULT_DATA_DIR = "D:\\JavaProject\\Lab\\LetterProject\\src\\data";
    /** Timeout for both list requests; the full list is the larger download, so it must not get less time than the probe. */
    private static final int LIST_TIMEOUT_MILLIS = 30000;
    private static final int CRAWLER_THREADS = 50;
    /** Keys the list endpoint emits without quotes, in the order they are patched. */
    private static final String[] UNQUOTED_KEYS = {
        "page", "pageNo", "totalCount", "totalPages", "pageSize", "result",
        "originalId", "letterType", "letterTypeName", "letterTitle",
        "showLetterTitle", "writeDate", "orgNames", "showOrgNames"
    };

    /**
     * Turns the list endpoint's JavaScript-style object literal into strict JSON by quoting
     * the known keys and replacing single quotes with double quotes.
     *
     * <p>This is plain text substitution: a value that itself contains {@code key:} or a
     * quote character is altered as well.
     */
    private static String toStrictJson(String raw) {
        String json = raw;
        for (String key : UNQUOTED_KEYS) {
            json = json.replace(key + ":", "\"" + key + "\":");
        }
        return json.replace("\'", "\"");
    }

    /** Downloads the letter list with the given page size and returns it as strict JSON text. */
    private static String fetchListJson(String pageSize) throws java.io.IOException {
        URL listUrl = new URL(LIST_URL_PREFIX + pageSize + LIST_URL_SUFFIX);
        return toStrictJson(Jsoup.parse(listUrl, LIST_TIMEOUT_MILLIS).text());
    }

    /**
     * Returns the serialized form of {@code item.field} with every double quote removed,
     * or {@code null} when the field is absent.
     */
    private static String fieldOf(JsonNode item, String field) {
        JsonNode value = item.get(field);
        return value == null ? null : value.toString().replace("\"", "");
    }

    /** Builds a {@link Letter} from one entry of the list response's {@code result} array. */
    private static Letter toLetter(JsonNode item) {
        Letter letter = new Letter();
        letter.setOriginalId(fieldOf(item, "originalId"));
        letter.setLetterType(fieldOf(item, "letterType"));
        letter.setLetterTypeName(fieldOf(item, "letterTypeName"));
        letter.setLetterTitle(fieldOf(item, "letterTitle"));
        letter.setShowLetterTitle(fieldOf(item, "showLetterTitle"));
        letter.setWriteDate(fieldOf(item, "writeDate"));
        letter.setOrgNames(fieldOf(item, "orgNames"));
        letter.setShowOrgNames(fieldOf(item, "showOrgNames"));
        return letter;
    }

    /** Returns the detail-page URL of a letter; "咨询" letters use the consult flow, all others the suggest flow. */
    private static String detailUrl(Letter letter) {
        String prefix = "咨询".equals(letter.getLetterTypeName()) ? CONSULT_DETAIL_URL : SUGGEST_DETAIL_URL;
        return prefix + letter.getOriginalId();
    }

    /**
     * Runs the complete crawl.
     *
     * @param args optional; {@code args[0]} overrides the data directory
     *             (default {@link #DEFAULT_DATA_DIR})
     */
    @SneakyThrows
    public static void main(String[] args) {
        File dataDir = new File(args.length > 0 ? args[0] : DEFAULT_DATA_DIR);
        File crawlRoot = new File(dataDir, "letter_json");
        File outputDir = new File(dataDir, "letters");
        ObjectMapper objectMapper = new ObjectMapper();

        // A page size of 0 is only used to learn the total count, which then becomes the
        // page size of the second request so the whole list arrives in one response.
        JsonNode probe = objectMapper.readTree(fetchListJson("0"));
        String totalCount = String.valueOf(probe.get("page").get("totalCount")).replace("\"", "");
        JsonNode listResponse = objectMapper.readTree(fetchListJson(totalCount));
        JsonNode result = listResponse.get("result");
        if (result == null) {
            throw new IllegalStateException("List response contains no \"result\" entry");
        }

        List<Letter> letters = new ArrayList<>();
        Map<String, Letter> lettersById = new HashMap<>();
        List<String> urlList = new ArrayList<>();
        for (JsonNode item : result) {
            Letter letter = toLetter(item);
            letters.add(letter);
            lettersById.put(letter.getOriginalId(), letter);
            urlList.add(detailUrl(letter));
        }

        Spider spider = Spider.create(new LetterProcess());
        spider.addUrl(urlList.toArray(new String[0]));
        spider.thread(CRAWLER_THREADS);
        spider.addPipeline(new LetterFilePipeline(crawlRoot.getPath()));
        spider.run();

        // Merge by id, not by position: the order of listFiles() is unspecified and pages
        // that failed to download leave gaps, so positional pairing would attach details to
        // the wrong letter.
        File[] crawledFiles = new File(crawlRoot, spider.getUUID()).listFiles();
        if (crawledFiles != null) {
            for (File crawledFile : crawledFiles) {
                if (!crawledFile.isFile() || !crawledFile.getName().endsWith(".json")) {
                    continue;
                }
                Letter detail = objectMapper.readValue(crawledFile, Letter.class);
                Letter target = lettersById.get(detail.getOriginalId());
                if (target == null) {
                    // File belongs to a letter that is not part of the current list.
                    continue;
                }
                target.setLetterTitle(detail.getLetterTitle());
                target.setWriteName(detail.getWriteName());
                target.setQuestion(detail.getQuestion());
                target.setAnswer(detail.getAnswer());
                target.setAnswerDate(detail.getAnswerDate());
            }
        }

        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            throw new IllegalStateException("Cannot create output directory " + outputDir);
        }
        for (Letter letter : letters) {
            objectMapper.writeValue(new File(outputDir, letter.getOriginalId() + ".json"), letter);
        }
    }
}