Android 学习心得(7)——读取网页源码+网页代码提取

最近这几天在学习 Android 抓取网页新闻。

拜读了这位大牛的文章:http://blog.csdn.net/lmj623565791/article/details/23532797

参考这位大牛的源码,试着自己写了一遍,结果惨不忍睹;不过既然是学习,我也就忍了。

虽然失败了,但是东西还是学到了(失败的原因是 CSDN 对网页做了处理,我只能抓取到无用的页面)。

首先   读取网页源码

 1 public String doGet(String arg0,final int newsType) throws Exception{
 2         //可变比特缓存
 3         final ByteArrayOutputStream out = new ByteArrayOutputStream();
 4         //地址URL
 5         URL url=null;
 6         //比特流
 7         try {
 8             url= new URL(arg0);
 9             //打开网络链接
10             HttpURLConnection conn = (HttpURLConnection) url.openConnection();
11             //使用get方式读取数据
12             conn.setRequestMethod("GET");
13             //设置超时5秒
14             conn.setConnectTimeout(5*1000);
15             //设置允许读写POST请求必须输入这两行
16             conn.setDoInput(true);
17             conn.setDoOutput(true);
18             int flag= conn.getResponseCode();
19             if(flag==HttpURLConnection.HTTP_OK){//如果返回200 则说明请求成功
20                 //取得输入流
21                 InputStream in = conn.getInputStream();
22                 byte[] data = readStream(in);
23                 //新建一个线程用于存储二进制数据进入文件
24 //                new Thread(){
25 //                    public void run(){
26 //                        ByteArrayOutputStreamWrite(in,newsType);
27 //                    }
28 //                }.start();
29                 String html = new String(data);
30                 return html;
31             }else{
32                  throw new Exception("访问网络失败!");
33             }
34         } catch (MalformedURLException e) {
35             throw new Exception("解析网络地址失败!");
36         } catch (IOException e) {
37             throw new Exception("读取数据失败!");
38         }
39     }
40 
41 
42     
43     private byte[] readStream(InputStream in) throws IOException {
44         ByteArrayOutputStream out  =new ByteArrayOutputStream(); 
45         //把流存入缓存中
46         int len = -1;
47         byte[] bs = new byte[1024];
48         while((len=in.read(bs))!=-1){
49             out.write(bs, 0, len);
50         }
51         in.close();
52         out.close();
53         return out.toByteArray();
54     }

接着  解析源代码从中抽取有用信息

 1 /**
 2      * 网页解析
 3      * @param data 网页源代码
 4      * @return 数据数组
 5      * @throws IOException 
 6      */
 7     public  List<Data> dataFilter(String data,int newsType) throws IOException {
 8         List<Data> list = new ArrayList<Data>();
 9         Document doc=  Jsoup.parse(data);
10         Elements units = doc.getElementsByClass("unit");
11         for(int i=0;i<units.size();i++){
12             Data thisdata=new Data();
13             Element unit = units.get(i);
14             //取得h1标签的内容
15             Element h1_ele=unit.getElementsByTag("h1").get(0);
16             //取得h1中<a>标签的内容
17             Element h1_a_ele = h1_ele.child(0);
18             //转换成文本
19             String title = h1_a_ele.text();
20             //取得<a>的harf属性作为链接
21             String url = h1_a_ele.attr("href");
22             
23             Element h4_ele=unit.getElementsByTag("h4").get(0);
24             Element h4_time_ele = h4_ele.child(0);
25             String date=h4_time_ele.text();
26             Element dl_ele = unit.getElementsByTag("dl").get(0);
27             Element dt_ele = dl_ele.child(0);
28             //获取图片地址和图片链接
29             try{
30             String imgUrl = dt_ele.child(1).attr("src");
31             String imgLinkUrl = dt_ele.child(0).attr("href");
32             thisdata.setImgLinkUrl(imgLinkUrl);
33             thisdata.setImgUrl(imgUrl);
34             }catch(Exception e){
35                 
36             }
37             Element dd_ele = dl_ele.child(1);
38             String content = dd_ele.text();
39             //放入data对象
40             thisdata.setContent(content);
41             thisdata.setTitle(title);
42             thisdata.setUrl(url);
43             thisdata.setDate(date);
44             thisdata.setType(newsType);
45             
46             list.add(thisdata);
47         }
48         return list;
49     }

 

posted @ 2015-04-23 09:46  飘0  阅读(1262)  评论(0)  编辑  收藏  举报