Java 爬虫 Gecco
主要代码:
1 Gecco(matchUrl="https://github.com/{user}/{project}", pipelines="consolePipeline") 2 public class MyGithub implements HtmlBean { 3 4 private static final long serialVersionUID = -7127412585200687225L; 5 6 @Request 7 private HttpRequest request; 8 9 @RequestParameter("user") 10 private String user; 11 12 @RequestParameter("project") 13 private String project; 14 15 @Text 16 @HtmlField(cssPath=".repository-meta-content") 17 private String title; 18 19 @Text 20 @HtmlField(cssPath=".pagehead-actions li:nth-child(2) .social-count") 21 private int star; 22 23 @Text 24 @HtmlField(cssPath=".pagehead-actions li:nth-child(3) .social-count") 25 private int fork; 26 27 @Href(click=false) 28 @HtmlField(cssPath="ul.numbers-summary > li:nth-child(4) > a") 29 private String contributors; 30 31 @HtmlField(cssPath=".entry-content") 32 private String readme; 33 34 public HttpRequest getRequest() { 35 return request; 36 } 37 38 public void setRequest(HttpRequest request) { 39 this.request = request; 40 } 41 42 public String getReadme() { 43 return readme; 44 } 45 46 public void setReadme(String readme) { 47 this.readme = readme; 48 } 49 50 public String getUser() { 51 return user; 52 } 53 54 public void setUser(String user) { 55 this.user = user; 56 } 57 58 public String getProject() { 59 return project; 60 } 61 62 public void setProject(String project) { 63 this.project = project; 64 } 65 66 public String getTitle() { 67 return title; 68 } 69 70 public void setTitle(String title) { 71 this.title = title; 72 } 73 74 public int getStar() { 75 return star; 76 } 77 78 public void setStar(int star) { 79 this.star = star; 80 } 81 82 public int getFork() { 83 return fork; 84 } 85 86 public void setFork(int fork) { 87 this.fork = fork; 88 } 89 90 public String getContributors() { 91 return contributors; 92 } 93 94 public void setContributors(String contributors) { 95 this.contributors = contributors; 96 } 97 98 public static void main(String[] args) { 99 GeccoEngine.create() 100 
.classpath("com.geccocrawler.gecco.demo") 101 //开始抓取的页面地址 102 .start("https://github.com/xtuhcy/gecco") 103 //开启几个爬虫线程,线程数量最好不要大于start request数量 104 .thread(2) 105 //单个爬虫每次抓取完一个请求后的间隔时间 106 .interval(2000) 107 .run(); 108 } 109 110 }
不定期会发布一些实用的Java开发文章