A Java Web Crawler Demo

    As a programmer you should have a crawler trick or two up your sleeve to get by in this industry. This is a small Java crawler demo I wrote during development, shared here for reference.

    The crawler depends on jsoup (jsoup.jar, i.e. the org.jsoup:jsoup artifact) for HTML parsing; the page download itself only uses the JDK's HttpURLConnection.
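
    A quick orientation first. The snippet below is not part of the original demo (the class name is made up); it is a minimal sketch of the three jsoup calls everything that follows relies on: Jsoup.parse turns an HTML string into a Document, select runs a CSS query, and text() returns an element's visible text.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupQuickCheck {
	public static void main(String[] args) {
		// Parse an in-memory HTML snippet into a jsoup Document
		Document doc = Jsoup.parse("<div class=\"para\">hello jsoup</div>");
		// select() takes a CSS query; text() returns the element's visible text
		for (Element e : doc.select("div.para")) {
			System.out.println(e.text()); // prints: hello jsoup
		}
	}
}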

    Straight to the code:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

// Demo crawler class (the class name is arbitrary); only jsoup and the JDK are needed on the classpath
public class CrawlerDemo {

	public static void main(String[] args) {
		try {
			// Baidu Baike entry for 999感冒灵 (a cold medicine)
			String url = "http://baike.baidu.com/link?url=c95Y4QJym_d_wFKGmcibRTI_KIyj-X_tOjnlOGJS9qekgO1tmWaWnrn7QyAjqvZX8At7LbI1XIa69IBZWejiCXDVM0jkSBMnVZKKu4jeg-ef4TJkKCXEXWcGJ8DRGTuHxW4qWB3pNNU7Y0KdrbNvGK";
			// Fetch the page HTML with a plain GET request
			String html = doget(url);
			// Parse the HTML with jsoup
			Document doc = Jsoup.parse(html);
			// Select every <div> whose class attribute is "para" (equivalent to the CSS selector div.para)
			Elements news = doc.select("div[class=para]");
			for (Element result : news) {
				// Print the text of each matched paragraph
				String str = result.text();
				System.out.println(str);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}

	}
 
	// Download the page at the given URL and return the response body as a UTF-8 string (null on failure)
	public static String doget(String path) {
		InputStream is = null;
		ByteArrayOutputStream baos = null;
		try {
			URL url = new URL(path.trim());
			HttpURLConnection conn = (HttpURLConnection) url.openConnection();
			// Fake the Referer header so simple hotlink protection accepts the request
			// (any URL on the target site works as the value)
			conn.setRequestProperty("Referer", "http://baike.baidu.com");

			if (200 == conn.getResponseCode()) {
				byte[] buff = new byte[4096];
				int count;
				// Assign to the method-level variables so the finally block can close them
				baos = new ByteArrayOutputStream(4096);
				is = conn.getInputStream();
				while ((count = is.read(buff)) != -1) {
					baos.write(buff, 0, count);
				}
				conn.disconnect();
				return baos.toString("UTF-8");
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (baos != null) {
				try {
					baos.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (is != null) {
				try {
					is.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}

		}

		return null;
	}
}
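
    For reference, jsoup can also download the page itself, so the hand-written doget() is not strictly necessary for simple cases. The sketch below is not from the original post (class and method names are made up, and it assumes a reasonably recent jsoup version): Jsoup.connect sets a user agent, a Referer and a timeout, fetches the page with get(), and then runs the same div[class=para] selection.

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupConnectDemo {

	// Fetch and parse a page in one step with jsoup's own HTTP client,
	// then print the text of every <div class="para">, as in the demo above
	public static void printParagraphs(String url) throws IOException {
		Document doc = Jsoup.connect(url)
				.userAgent("Mozilla/5.0")           // some sites reject Java's default user agent
				.referrer("http://baike.baidu.com") // same idea as the faked Referer in doget()
				.timeout(10000)                     // connect/read timeout in milliseconds
				.get();
		for (Element para : doc.select("div[class=para]")) {
			System.out.println(para.text());
		}
	}

	public static void main(String[] args) throws IOException {
		// Same Baidu Baike URL as in the demo above
		printParagraphs("http://baike.baidu.com/link?url=c95Y4QJym_d_wFKGmcibRTI_KIyj-X_tOjnlOGJS9qekgO1tmWaWnrn7QyAjqvZX8At7LbI1XIa69IBZWejiCXDVM0jkSBMnVZKKu4jeg-ef4TJkKCXEXWcGJ8DRGTuHxW4qWB3pNNU7Y0KdrbNvGK");
	}
}

    jsoup's Connection also exposes header() and cookie() if a site needs more than a Referer, which is usually enough for simple scraping without pulling in a separate HTTP client.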

Please credit the source when reposting.

posted @ 2017-03-29 14:16 安宁~