博客园博文爬取 标签爬取(含源代码)
爬取思路:
1,在首页上爬取这些推荐博文:https://www.cnblogs.com/
2,根据这些推荐博文,进一步进入发布它们的博主的主页。
3,如果要爬取标签,可以查看这些博主的标签页:只需在博主主页地址后面加上 tag/(例如 https://www.cnblogs.com/博主名/tag/),就可以跳转到标签页。
4,如果要爬取博文内容,就进入这些博主主页上列出的每一篇博文页面进行爬取。
下面是我的代码:
1 package use; 2 3 import java.sql.Connection; 4 import java.sql.PreparedStatement; 5 import java.util.ArrayList; 6 import java.util.Date; 7 import java.util.List; 8 9 import com.dao.ClarifyDao; 10 import com.dao.InfoDao; 11 import org.jsoup.Jsoup; 12 import org.jsoup.nodes.Document; 13 14 import us.codecraft.webmagic.Page; 15 import us.codecraft.webmagic.Site; 16 import us.codecraft.webmagic.Spider; 17 import us.codecraft.webmagic.processor.PageProcessor; 18 19 public class 博客园内容 implements PageProcessor { 20 static int nn=0; 21 static String regEx="[\n`'' ]"; 22 // static String regEx="[\n`~!@#$%^&()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。, 、?? ]"; 23 static String aa = "";//这里是将特殊字符换为aa字符串," "代表直接去掉 24 private static Connection conn = null; 25 26 private static PreparedStatement ps = null; 27 // 标题和链接获取 28 29 private static String TITLEQUERY = "div.post_item_body h3 a.titlelnk"; 30 31 private static String TITLE = "div.post h1 a.postTitle2"; 32 // 作者 33 34 private static String AUTHORQUERY = "div.post_item_foot a.lightblue "; 35 36 37 //初始化带爬取网页地址 38 private static List<String> urls() { 39 List listUrl=new ArrayList<String>(); 40 for(int i=1;i<=200;i++) { 41 listUrl.add("https://www.cnblogs.com/sitehome/p/"+i); 42 43 } 44 listUrl.toArray(new String[listUrl.size()]); 45 return listUrl; 46 } 47 private static void add_urls_child(Page page) { 48 List listUrl=new ArrayList<String>(); 49 listUrl= page.getHtml().xpath("//*[@id=\"post_list\"]//*/div[2]/div/a//@href").all(); 50 51 listUrl.toArray(new String[listUrl.size()]); 52 page.addTargetRequests(listUrl); 53 54 } 55 56 private static void add_urls_child_page(Page page) { 57 List listUrl=new ArrayList<String>(); 58 listUrl= page.getHtml().xpath("//div[@class=\"postTitle\"]/a//@href").all(); 59 60 listUrl.toArray(new String[listUrl.size()]); 61 page.addTargetRequests(listUrl); 62 63 } 64 65 //jsoup根据html字符串和语法来获取内容 66 private static String selectDocumentText(String htmlText,String Query) { 67 Document 
doc=Jsoup.parse(htmlText); 68 String select=doc.select(Query).text(); 69 return select; 70 } 71 72 //jsoup根据html字符串和语法获取链接地址 73 private static String selectDocumentLink(String htmlText,String Query) { 74 Document doc=Jsoup.parse(htmlText); 75 String select=doc.select(Query).attr("href"); 76 return select; 77 } 78 79 @Override 80 public Site getSite() { 81 return Site.me().setSleepTime(1000).setRetryTimes(10); 82 } 83 84 //编写抽取逻辑 85 @Override 86 public void process(Page page) { 87 nn=nn+1; 88 if(nn==1) 89 { 90 System.out.println("TTTTTTTTTTTTT"); 91 page.addTargetRequests(urls()); 92 } 93 94 String str = page.getUrl().get(); 95 96 if(str.matches("https://www.cnblogs.com/sitehome/p/[0-9]+")) 97 { 98 System.out.println("AAAAA"); 99 add_urls_child(page); 100 } 101 else if(str.matches("https://www.cnblogs.com/[A-Za-z0-9_-]+/")) 102 { 103 System.out.println("BBBBBBB"); 104 add_urls_child_page(page); 105 }else 106 { 107 System.out.println("DDDDDD"); 108 109 String title=page.getHtml().xpath("//*[@id='cb_post_title_url']//text()").get(); 110 111 String URL=page.getUrl().get(); 112 113 114 115 String author=page.getHtml().xpath("//*[@id='Header1_HeaderTitle']//text()").get(); 116 List<String> values=new ArrayList<String>(); 117 values=page.getHtml().xpath("//*[@id='cnblogs_post_body']//*//text()").all(); 118 String info=""; 119 for(String value:values) 120 { 121 info+=value; 122 } 123 info=info.replaceAll(regEx, aa); 124 System.out.println("Title:\t"+title); 125 System.out.println("AUTHOR:\t"+author); 126 System.out.println( "VALUE:\t"+info); 127 System.out.println("URL:\t"+URL); 128 ClarifyDao.add("blog_info","",title,author,info,URL); 129 130 } 131 132 133 134 135 /* 136 //定义如何抽取页面信息 137 138 List<String> htmls=page.getHtml().xpath("//div[@class='post_item']/html()").all(); 139 140 // List<JavaBokeModel> javaBokes=new ArrayList<JavaBokeModel>(); 141 for(String html:htmls) { 142 // JavaBokeModel javaBoke=new JavaBokeModel(); 143 //标题和链接 144 String 
title=selectDocumentText(html,TITLEQUERY); 145 146 String linke=selectDocumentLink(html,TITLEQUERY); 147 //作者和作者主页 148 String author=selectDocumentText(html,AUTHORQUERY); 149 150 System.out.println( 151 "TITLE\t"+title+ 152 "Link\t"+linke+ 153 "Author\t"+author 154 ); 155 156 157 158 } 159 */ 160 //File.WriteStringToFile2(javaBokes); 161 162 163 } 164 165 public static void main(String[] args) { 166 long startTime,endTime; 167 //DBUtil.getConnection(); 168 startTime=new Date().getTime(); 169 InfoDao.delete("blog_info"); 170 Spider create=Spider.create(new 博客园内容()); 171 create.addUrl("https://www.cnblogs.com/").thread(5).run(); 172 try { 173 ps.close(); 174 conn.close(); 175 }catch(Exception e) { 176 177 } 178 endTime=new Date().getTime(); 179 System.out.println("用时为:"+(endTime-startTime)/1000+"s"); 180 181 } 182 183 }
1 package use; 2 3 import java.sql.Connection; 4 import java.sql.PreparedStatement; 5 import java.util.ArrayList; 6 import java.util.Date; 7 import java.util.List; 8 9 import com.dao.InfoDao; 10 import org.jsoup.Jsoup; 11 import org.jsoup.nodes.Document; 12 13 import us.codecraft.webmagic.Page; 14 import us.codecraft.webmagic.Site; 15 import us.codecraft.webmagic.Spider; 16 import us.codecraft.webmagic.processor.PageProcessor; 17 18 public class 博客园标签 implements PageProcessor { 19 static int nn=0; 20 private static Connection conn = null; 21 22 private static PreparedStatement ps = null; 23 // 标题和链接获取 24 25 private static String TITLEQUERY = "div.post_item_body h3 a.titlelnk"; 26 27 private static String TITLE = "div.post h1 a.postTitle2"; 28 // 作者 29 30 private static String AUTHORQUERY = "div.post_item_foot a.lightblue "; 31 32 33 //初始化带爬取网页地址 34 private static List<String> urls() { 35 List listUrl=new ArrayList<String>(); 36 for(int i=2;i<=200;i++) { 37 listUrl.add("https://www.cnblogs.com/sitehome/p/"+i); 38 39 } 40 listUrl.toArray(new String[listUrl.size()]); 41 return listUrl; 42 } 43 private static void add_urls_child(Page page) { 44 List listUrl=new ArrayList<String>(); 45 List<String> Urls=new ArrayList<String>(); 46 Urls= page.getHtml().xpath("//*[@id=\"post_list\"]//*/div[2]/div/a//@href").all(); 47 48 for(String ur:Urls) 49 { 50 ur+="tag/"; 51 listUrl.add(ur); 52 } 53 listUrl.toArray(new String[listUrl.size()]); 54 page.addTargetRequests(listUrl); 55 56 } 57 58 //jsoup根据html字符串和语法来获取内容 59 private static String selectDocumentText(String htmlText,String Query) { 60 Document doc=Jsoup.parse(htmlText); 61 String select=doc.select(Query).text(); 62 return select; 63 } 64 65 //jsoup根据html字符串和语法获取链接地址 66 private static String selectDocumentLink(String htmlText,String Query) { 67 Document doc=Jsoup.parse(htmlText); 68 String select=doc.select(Query).attr("href"); 69 return select; 70 } 71 72 @Override 73 public Site getSite() { 74 return 
Site.me().setSleepTime(1000).setRetryTimes(10); 75 } 76 77 //编写抽取逻辑 78 @Override 79 public void process(Page page) { 80 nn=nn+1; 81 if(nn==1) 82 { 83 page.addTargetRequests(urls()); 84 } 85 if(page.getUrl().regex("https://www.cnblogs.com/sitehome/p/[0-9]+").match()) 86 { 87 add_urls_child(page); 88 } 89 90 else 91 { 92 System.out.println("DDDDDD"); 93 94 String title=page.getHtml().xpath("//*[@id=\"Header1_HeaderTitle\"]//text()").get(); 95 String URL=page.getUrl().get(); 96 System.out.println("Title:\t"+title); 97 System.out.println("URL:\t"+URL); 98 List<String> tags=new ArrayList<String>(); 99 tags=page.getHtml().xpath("//*[@id=\"MyTag1_dtTagList\"]/tbody//a//text()").all(); 100 for(String tag:tags) 101 { 102 System.out.println( 103 "TAG:\t"+tag 104 ); 105 InfoDao.add("blog",tag,title,URL); 106 } 107 108 109 } 110 111 112 113 114 /* 115 //定义如何抽取页面信息 116 117 List<String> htmls=page.getHtml().xpath("//div[@class='post_item']/html()").all(); 118 119 // List<JavaBokeModel> javaBokes=new ArrayList<JavaBokeModel>(); 120 for(String html:htmls) { 121 // JavaBokeModel javaBoke=new JavaBokeModel(); 122 //标题和链接 123 String title=selectDocumentText(html,TITLEQUERY); 124 125 String linke=selectDocumentLink(html,TITLEQUERY); 126 //作者和作者主页 127 String author=selectDocumentText(html,AUTHORQUERY); 128 129 System.out.println( 130 "TITLE\t"+title+ 131 "Link\t"+linke+ 132 "Author\t"+author 133 ); 134 135 136 137 } 138 */ 139 //File.WriteStringToFile2(javaBokes); 140 141 142 } 143 144 public static void main(String[] args) { 145 long startTime,endTime; 146 //DBUtil.getConnection(); 147 startTime=new Date().getTime(); 148 149 Spider create=Spider.create(new 博客园标签()); 150 create.addUrl("http://www.cnblogs.com/").thread(5).run(); 151 try { 152 ps.close(); 153 conn.close(); 154 }catch(Exception e) { 155 156 } 157 endTime=new Date().getTime(); 158 System.out.println("用时为:"+(endTime-startTime)/1000+"s"); 159 160 } 161 162 }