WebMagic爬取北京市政信件内容

我创建了Letter类用来储存信件,重写了LetterFilePipeline使得爬取保存的文件名为信件Id,并采用多线程爬取,最后将结果保存到letters目录下

Letter

package org.example.crawler_letter;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * Plain data holder for one letter.
 *
 * <p>Lombok generates the getters/setters, {@code equals}/{@code hashCode}/{@code toString},
 * a no-argument constructor and an all-arguments constructor whose parameter order
 * follows the field declaration order below, so the fields must not be reordered.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class Letter {
    // Identifier of the letter; also used as the JSON file name and in the detail-page URL.
    private String originalId;
    private String letterType;
    // Type label; LetterMain compares it with "咨询" to pick the detail-page URL.
    private String letterTypeName;
    private String letterTitle;
    private String showLetterTitle;
    private String writeDate;
    private String orgNames;
    private String showOrgNames;
    // The remaining fields are not set from the listing; LetterMain copies them
    // from the crawled detail page.
    private String writeName;
    private String answerDate;
    private String question;
    private String answer;
}

LetterFilePipeline

package org.example.crawler_letter;

import com.alibaba.fastjson.JSON;
import lombok.SneakyThrows;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import java.io.FileWriter;
import java.io.PrintWriter;

/**
 * JSON file pipeline that stores every crawled page as
 * {@code <path>/<task UUID>/<originalId>.json} instead of using the file name
 * chosen by {@link JsonFilePipeline}.
 */
public class LetterFilePipeline extends JsonFilePipeline {
    /**
     * @param path root directory under which the per-task sub-directory is created
     */
    public LetterFilePipeline(String path) {
        super(path);
    }

    /**
     * Serializes all extracted fields of one page to a JSON file named after the
     * page's {@code originalId} field.
     *
     * @param resultItems fields extracted by the page processor; must contain {@code originalId}
     * @param task        running task, whose UUID names the sub-directory
     */
    @SneakyThrows
    @Override
    public void process(ResultItems resultItems, Task task) {
        String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        String fileName = resultItems.get("originalId");
        // Write explicitly as UTF-8: the platform default charset (e.g. GBK on a
        // Chinese Windows install) would otherwise produce files that a JSON reader
        // expecting UTF-8 cannot decode. try-with-resources closes the writer even
        // if serialization or writing fails.
        try (PrintWriter printWriter = new PrintWriter(this.getFile(path + fileName + ".json"), "UTF-8")) {
            printWriter.write(JSON.toJSONString(resultItems.getAll()));
        }
    }
}

LetterProcess

package org.example.crawler_letter;

import org.jsoup.nodes.Document;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.concurrent.atomic.AtomicInteger;

/**
 * WebMagic page processor that extracts the fields of one letter detail page
 * (id, title, writer, question, answer and answer date) into the page's result items.
 */
public class LetterProcess implements PageProcessor {
    // Number of leading characters dropped from the writer text.
    // NOTE(review): presumably a 4-character label in front of the name - confirm against the page.
    private static final int WRITER_LABEL_LENGTH = 4;
    // Number of leading characters dropped from the answer-date text.
    // NOTE(review): presumably a 5-character label in front of the date - confirm against the page.
    private static final int ANSWER_DATE_LABEL_LENGTH = 5;

    // Pages are processed by several spider threads at once, so the progress
    // counter must be atomic; a plain static int loses increments.
    private static final AtomicInteger PAGE_COUNTER = new AtomicInteger();

    // Configured once instead of being re-mutated on every getSite() call.
    private final Site site = Site.me()
            .setCharset("UTF-8")
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188");

    /**
     * Extracts the letter fields from a detail page. Pages without a
     * {@code <strong>} title element are skipped so that no file is written for them.
     *
     * @param page downloaded detail page; the letter id is taken from the text
     *             after the last {@code '='} of its URL
     */
    @Override
    public void process(Page page) {
        System.out.println(page.getUrl());
        System.out.println(PAGE_COUNTER.getAndIncrement());
        String url = String.valueOf(page.getUrl());
        Document doc = page.getHtml().getDocument();

        // Without a title element the page is not a letter detail page (or its
        // layout changed); skip it instead of failing with a NullPointerException.
        if (doc.select("strong").isEmpty()) {
            System.err.println("Skipping page without <strong> title: " + url);
            page.setSkip(true);
            return;
        }

        page.putField("originalId", url.substring(url.lastIndexOf("=") + 1));
        page.putField("letterTitle", doc.select("strong").first().text());
        page.putField("writeName", dropPrefix(
                doc.getElementsByClass("col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted").text(),
                WRITER_LABEL_LENGTH));
        page.putField("question", doc.getElementsByClass("col-xs-12 col-md-12 column p-2 text-muted mx-2").text());
        page.putField("answer", doc.getElementsByClass("col-xs-12 col-md-12 column p-4 text-muted my-3").text());
        page.putField("answerDate", dropPrefix(
                doc.getElementsByClass("col-xs-12 col-sm-3 col-md-3 my-2").text(),
                ANSWER_DATE_LABEL_LENGTH));
    }

    /**
     * @return the site settings (UTF-8 charset and a desktop browser user agent)
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Drops the first {@code prefixLength} characters of {@code text}; returns an
     * empty string when the text is not longer than the prefix, instead of throwing.
     */
    private static String dropPrefix(String text, int prefixLength) {
        return text.length() > prefixLength ? text.substring(prefixLength) : "";
    }
}

LetterMain

package org.example.crawler_letter;

import lombok.SneakyThrows;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.node.ArrayNode;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import us.codecraft.webmagic.Spider;

import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

/**
 * Entry point of the crawler.
 *
 * <p>Steps: (1) request the letter listing once with page size 0 to read the total
 * count, (2) request it again with that count as page size to get every letter,
 * (3) crawl each letter's detail page with WebMagic, which stores one JSON file per
 * letter, (4) merge the crawled details into the listing data and (5) write one
 * JSON file per letter into the output directory.
 */
public class LetterMain {

    private static final String LIST_URL_PREFIX = "https://www.beijing.gov.cn/hudong/hdjl/sindex/bjah-index-hdjl!letterListJson.action?keyword=&startDate=&endDate=&letterType=0&page.pageNo=1&page.pageSize=";
    private static final String LIST_URL_SUFFIX = "&orgtitleLength=26";
    private static final String CONSULT_DETAIL_URL = "https://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=";
    private static final String SUGGEST_DETAIL_URL = "https://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId=";

    /** Root directory handed to the pipeline; details land in a sub-directory named after the task UUID. */
    private static final String DETAIL_ROOT = "D:\\JavaProject\\Lab\\LetterProject\\src\\data\\letter_json";
    /** Directory receiving the merged per-letter JSON files. */
    private static final String OUTPUT_DIR = "D:\\JavaProject\\Lab\\LetterProject\\src\\data\\letters";

    /** Timeout for both listing requests; the full listing is the larger response, so it must not get less time. */
    private static final int LIST_TIMEOUT_MILLIS = 30000;

    /** Keys that appear unquoted in the listing response, in the order they are quoted. */
    private static final String[] UNQUOTED_KEYS = {
            "page", "pageNo", "totalCount", "totalPages", "pageSize", "result",
            "originalId", "letterType", "letterTypeName", "letterTitle", "showLetterTitle",
            "writeDate", "orgNames", "showOrgNames"
    };

    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Turns the listing response into strict JSON by quoting the known keys and
     * replacing single quotes with double quotes.
     *
     * <p>This is plain text substitution: a value that itself contains one of the
     * {@code key:} sequences, a single quote or a double quote is altered as well.
     */
    private static String toStrictJson(String s) {
        for (String key : UNQUOTED_KEYS) {
            s = s.replace(key + ":", "\"" + key + "\":");
        }
        return s.replace("\'", "\"");
    }

    /** Downloads the listing with the given page size and parses it into a JSON tree. */
    @SneakyThrows
    private static JsonNode fetchListing(String pageSize) {
        Document doc = Jsoup.parse(new URL(LIST_URL_PREFIX + pageSize + LIST_URL_SUFFIX), LIST_TIMEOUT_MILLIS);
        return MAPPER.readTree(toStrictJson(doc.text()));
    }

    /** Returns the field's JSON text with all double quotes removed, or {@code null} when the field is absent. */
    private static String text(JsonNode node, String field) {
        JsonNode value = node.get(field);
        return value == null ? null : value.toString().replace("\"", "");
    }

    /** Builds one {@link Letter} per entry of the listing's {@code result} array; entries without an id are dropped. */
    private static List<Letter> parseLetters(JsonNode listing) {
        JsonNode result = listing.get("result");
        if (!(result instanceof ArrayNode)) {
            throw new IllegalStateException("Listing response has no \"result\" array");
        }
        List<Letter> letters = new ArrayList<>();
        for (JsonNode entry : result) {
            String originalId = text(entry, "originalId");
            if (originalId == null) {
                // Without an id neither the detail URL nor the output file name can be built.
                System.err.println("Skipping listing entry without originalId: " + entry);
                continue;
            }
            Letter letter = new Letter();
            letter.setOriginalId(originalId);
            letter.setLetterType(text(entry, "letterType"));
            letter.setLetterTypeName(text(entry, "letterTypeName"));
            letter.setLetterTitle(text(entry, "letterTitle"));
            letter.setShowLetterTitle(text(entry, "showLetterTitle"));
            letter.setWriteDate(text(entry, "writeDate"));
            letter.setOrgNames(text(entry, "orgNames"));
            letter.setShowOrgNames(text(entry, "showOrgNames"));
            letters.add(letter);
        }
        return letters;
    }

    /** Letters whose type name is "咨询" use the consult detail page, all others the suggest detail page. */
    private static String detailUrl(Letter letter) {
        String prefix = "咨询".equals(letter.getLetterTypeName()) ? CONSULT_DETAIL_URL : SUGGEST_DETAIL_URL;
        return prefix + letter.getOriginalId();
    }

    /**
     * Copies the crawled detail fields into each letter.
     *
     * <p>The detail file is looked up by letter id ({@code <originalId>.json}) rather
     * than by position in a directory listing: {@code File.listFiles()} guarantees no
     * order, and a single failed page or stale file would otherwise attach details to
     * the wrong letter. Letters without a detail file are reported and left unchanged.
     */
    @SneakyThrows
    private static void mergeDetails(List<Letter> letters, File detailDir) {
        for (Letter letter : letters) {
            File detailFile = new File(detailDir, letter.getOriginalId() + ".json");
            if (!detailFile.isFile()) {
                System.err.println("No crawled detail for letter " + letter.getOriginalId());
                continue;
            }
            Letter detail = MAPPER.readValue(detailFile, Letter.class);
            letter.setLetterTitle(detail.getLetterTitle());
            letter.setWriteName(detail.getWriteName());
            letter.setQuestion(detail.getQuestion());
            letter.setAnswer(detail.getAnswer());
            letter.setAnswerDate(detail.getAnswerDate());
        }
    }

    /** Writes every letter to {@code <OUTPUT_DIR>/<originalId>.json}, creating the directory if needed. */
    @SneakyThrows
    private static void writeLetters(List<Letter> letters) {
        File outputDir = new File(OUTPUT_DIR);
        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            throw new IllegalStateException("Cannot create output directory " + outputDir);
        }
        for (Letter letter : letters) {
            MAPPER.writeValue(new File(outputDir, letter.getOriginalId() + ".json"), letter);
        }
    }

    /**
     * Runs the whole crawl; see the class description for the individual steps.
     *
     * @param args unused
     */
    @SneakyThrows
    public static void main(String[] args) {
        // A page size of 0 is requested only to read the total number of letters.
        JsonNode pageInfo = fetchListing("0").get("page");
        JsonNode totalCount = pageInfo == null ? null : pageInfo.get("totalCount");
        if (totalCount == null) {
            throw new IllegalStateException("Listing response has no page.totalCount");
        }
        String num = String.valueOf(totalCount).replace("\"", "");

        List<Letter> letters = parseLetters(fetchListing(num));
        if (letters.isEmpty()) {
            System.out.println("No letters found, nothing to crawl");
            return;
        }

        List<String> urlList = new ArrayList<>(letters.size());
        for (Letter letter : letters) {
            urlList.add(detailUrl(letter));
        }

        Spider spider = Spider.create(new LetterProcess());
        spider.addUrl(urlList.toArray(new String[0]));
        spider.thread(50);
        spider.addPipeline(new LetterFilePipeline(DETAIL_ROOT));
        spider.run();

        // The pipeline writes below <root>/<task UUID>/, so ask the spider for the
        // UUID instead of hard-coding the host name.
        mergeDetails(letters, new File(DETAIL_ROOT, spider.getUUID()));
        writeLetters(letters);
    }
}

 

posted @ 2023-08-04 22:14  突破铁皮  阅读(8)  评论(0)  编辑  收藏  举报