Crawler

WikiCrawler drives the whole thing: seed URLs go into LinkQueue, each dequeued page is downloaded to disk by DownLoadFile, read back by HtmlContent, its <title> is printed, and every absolute http:// link found by WikiParseHtml is queued for a later visit, stopping once 1000 pages have been visited.

import java.io.IOException;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;


public class WikiCrawler {

    // Put the seed URLs into the queue of pages waiting to be visited.
    private void initCrawlerWithSeeds(String[] seeds) {
        for (int i = 0; i < seeds.length; i++) {
            LinkQueue.addUnvisitedUrl(seeds[i]);
        }
    }

    // Breadth-first crawl: download each queued page, print its <title>,
    // and queue every absolute http:// link on it, up to 1000 visited pages.
    public void crawling(String[] seeds) throws IOException, ParserException {
        initCrawlerWithSeeds(seeds);
        while (!LinkQueue.unVisitedUrlEmpty() && LinkQueue.getVisitedUrlNum() < 1000) {
            String visitUrl = (String) LinkQueue.unVisitedUrlDequeue();

            // Filter used below to pull the <title> node out of the page.
            TagNameFilter tagNameFilter = new TagNameFilter("title");

            // Download the page to D:// and get the local file path back (null on failure).
            DownLoadFile downLoadFile = new DownLoadFile("D://");
            String filepath = downLoadFile.downloadFile(visitUrl);
            System.out.println(filepath);
            if (filepath != null) {
                String contentString = HtmlContent.getHtml(filepath);

                NodeList list = Parser.createParser(contentString, "utf-8").extractAllNodesThatMatch(tagNameFilter);
                if (list.size() > 0) {
                    String title = ((Node) list.elementAt(0)).toPlainTextString();
                    System.out.println(title);
                }
                LinkQueue.addVisitedUrl(visitUrl);
                if (contentString != null) {
                    Set<String> linksSet = WikiParseHtml.extractLinkSet(contentString);
                    for (String link : linksSet) {
                        LinkQueue.addUnvisitedUrl(link);
                    }
                }
            }
        }
    }

    public static void main(String[] args) throws IOException, ParserException {
        WikiCrawler crawler = new WikiCrawler();
        crawler.crawling(new String[]{"http://free0007.iteye.com"});
    }
}

HtmlContent

HtmlContent reads a downloaded page back from disk as a single UTF-8 string.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;


public class HtmlContent {

    // Read a downloaded page back from disk as one UTF-8 string.
    public static String getHtml(String filepath) throws IOException {
        try {
            BufferedReader bis = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(filepath)), "UTF-8"));
            StringBuffer stringBuffer = new StringBuffer(250000);
            String szTemp;

            while ((szTemp = bis.readLine()) != null) {
                stringBuffer.append(szTemp + "\n");
            }
            bis.close();
            return stringBuffer.toString();
        } catch (Exception e) {
            // On any read error, return an empty page rather than aborting the crawl.
            return "";
        }
    }

    /*public static void main(String[] args) {
        try {
            System.out.print(HtmlContent.getHtml("D://zh.wikipedia.org_wiki_Wikipedia_%E9%A6%96%E9%A1%B5.html"));
        } catch (IOException e) {
            System.out.print("error");
            e.printStackTrace();
        }
    }*/
}
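For reference, the same read can be done with java.nio — a minimal sketch, assuming Java 7+ and UTF-8 files; the class name HtmlContentNio is made up for illustration, and unlike getHtml above it keeps the file's original line endings:

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class HtmlContentNio {

    // Read the whole file as UTF-8, returning "" instead of throwing on any error.
    public static String getHtml(String filepath) {
        try {
            return new String(Files.readAllBytes(Paths.get(filepath)), StandardCharsets.UTF_8);
        } catch (Exception e) {
            return "";
        }
    }
}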

DownLoadFile

DownLoadFile fetches one URL with Apache HttpClient, derives a legal local file name from the URL and the Content-Type header, and saves the response body under the configured directory.

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;


public class DownLoadFile {

    private String filepath = "";

    public DownLoadFile(String filepath) {
        this.filepath = filepath;
    }

    // Turn a URL into a legal local file name: drop the "http://" prefix,
    // replace characters that are illegal in file names with '_', and append
    // an extension derived from the Content-Type header.
    public String getFileNameByUrl(String url, String contentType) {
        url = url.substring(7);
        if (contentType.indexOf("html") != -1) {
            return url.replaceAll("[\\?/:*|<>\"]", "_") + ".html";
        } else {
            // e.g. application/pdf -> ".pdf"
            return url.replaceAll("[\\?/:*|<>\"]", "_") + "."
                    + contentType.substring(contentType.indexOf("/") + 1);
        }
    }

    // Copy the response body to the given local path.
    private void saveToLocal(InputStream is, String filePath) throws IOException {
        try {
            DataOutputStream outputStream = new DataOutputStream(new FileOutputStream(new File(filePath)));
            int len = 0;
            byte[] buffer = new byte[1024];
            while ((len = is.read(buffer)) != -1) {
                outputStream.write(buffer, 0, len);
            }
            outputStream.flush();
            outputStream.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    // Download one URL; return the local file path on success, null otherwise.
    public String downloadFile(String url) throws IOException {
        String filePathString = null;
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(5000).build();
        httpGet.setConfig(requestConfig);
        CloseableHttpResponse response = httpClient.execute(httpGet);
        try {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println(url + " Failed:" + response.getStatusLine());
                filePathString = null;
            } else {
                HttpEntity entity = response.getEntity();
                InputStream input = entity.getContent();
                Header header = entity.getContentType();
                filePathString = filepath + getFileNameByUrl(url, header.getValue());
                saveToLocal(input, filePathString);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            response.close();
            httpClient.close();
        }
        return filePathString;
    }
}
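To make the file-naming rule concrete, a small sketch — the demo class and the example URL are made up for illustration:

public class FileNameDemo {
    public static void main(String[] args) {
        DownLoadFile d = new DownLoadFile("D://");
        // "http://" is stripped, ? / : * | < > " become '_', and ".html" is appended,
        // so this prints: example.com_wiki_Main_Page_action=view.html
        System.out.println(d.getFileNameByUrl("http://example.com/wiki/Main_Page?action=view", "text/html"));
    }
}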

WikiParseHtml

WikiParseHtml extracts the set of absolute http:// links (<a href="...">) from a page's HTML.

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;


public class WikiParseHtml {

    // Collect every absolute http:// link (<a href="...">) found in the page content.
    public static Set<String> extractLinkSet(String content) {
        Set<String> linksSet = new HashSet<String>();
        try {
            Parser parser = Parser.createParser(content, "utf-8");
            NodeClassFilter nodeClassFilter = new NodeClassFilter(LinkTag.class);
            NodeList list = parser.extractAllNodesThatMatch(nodeClassFilter);
            for (int i = 0; i < list.size(); i++) {
                Node tagNode = list.elementAt(i);
                if (tagNode instanceof LinkTag) {
                    LinkTag linkTag = (LinkTag) tagNode;
                    String urlString = linkTag.getLink();
                    if (urlString.startsWith("http://")) {
                        linksSet.add(urlString);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return linksSet;
    }
}
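A quick usage sketch (the HTML snippet and demo class are made up for illustration); only the absolute http:// link survives the filter:

import java.util.Set;

public class ExtractLinksDemo {
    public static void main(String[] args) {
        String html = "<html><body>"
                + "<a href=\"http://example.com/a\">kept</a>"
                + "<a href=\"/relative/path\">dropped</a>"
                + "</body></html>";
        Set<String> links = WikiParseHtml.extractLinkSet(html);
        System.out.println(links); // [http://example.com/a]
    }
}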

Queue

Queue is a minimal FIFO queue backed by a LinkedList; the first-in-first-out order is what makes the crawl breadth-first.

import java.util.LinkedList;


public class Queue {

    // Pending items in first-in-first-out order.
    private LinkedList<Object> queue = new LinkedList<Object>();

    public void enQueue(Object t) {
        queue.addLast(t);
    }

    public Object deQueue() {
        return queue.removeFirst();
    }

    public boolean isQueueEmpty() {
        return queue.isEmpty();
    }

    public boolean contains(Object t) {
        return queue.contains(t);
    }

    public boolean empty() {
        return queue.isEmpty();
    }
}
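A tiny usage sketch of the FIFO behavior the crawler relies on (the demo class and URLs are illustrative):

public class QueueDemo {
    public static void main(String[] args) {
        Queue q = new Queue();
        q.enQueue("http://example.com/first");
        q.enQueue("http://example.com/second");
        // The oldest entry comes out first, which gives the crawl its breadth-first order.
        System.out.println(q.deQueue()); // http://example.com/first
    }
}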

LinkQueue

LinkQueue holds the crawl state: a Set of already-visited URLs and a Queue of URLs still waiting to be visited, with de-duplication on insert.

import java.util.HashSet;
import java.util.Set;


public class LinkQueue {

    // URLs that have already been crawled.
    private static Set<String> visitedUrl = new HashSet<String>();
    // URLs waiting to be crawled, in FIFO order.
    private static Queue unvisitedUrl = new Queue();

    public static Queue getUnvisitedUrl() {
        return unvisitedUrl;
    }

    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    public static void removeVisitedUrl(String url) {
        visitedUrl.remove(url);
    }

    public static Object unVisitedUrlDequeue() {
        return unvisitedUrl.deQueue();
    }

    // Enqueue a URL only if it is non-empty and has not been seen before,
    // either as a visited page or as a page already waiting in the queue.
    public static void addUnvisitedUrl(String url) {
        if (url != null && !url.trim().equals("")
                && !visitedUrl.contains(url) && !unvisitedUrl.contains(url))
            unvisitedUrl.enQueue(url);
    }

    public static int getVisitedUrlNum() {
        return visitedUrl.size();
    }

    public static boolean unVisitedUrlEmpty() {
        return unvisitedUrl.isQueueEmpty();
    }
}
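A small check of the de-duplication logic (a sketch; the demo class and URLs are illustrative):

public class LinkQueueDemo {
    public static void main(String[] args) {
        LinkQueue.addUnvisitedUrl("http://example.com/page");
        LinkQueue.addUnvisitedUrl("http://example.com/page"); // already queued  -> ignored
        LinkQueue.addVisitedUrl("http://example.com/done");
        LinkQueue.addUnvisitedUrl("http://example.com/done"); // already visited -> ignored
        System.out.println(LinkQueue.unVisitedUrlDequeue());  // http://example.com/page
        System.out.println(LinkQueue.unVisitedUrlEmpty());    // true
    }
}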

 

posted @ 2014-10-02 18:15  i wish i