Small Focused-Crawler Examples
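Two minimal focused crawlers follow: a Python 2 script that walks a dataguru.cn channel, saves article text to timestamped files, and ends with a short jieba word-segmentation demo; and a Java class that walks Baidu Zhidao category pages with HtmlCleaner.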
# -*- coding: utf-8 -*-
import urllib2
from lxml import etree
import Queue
import time
import os

import jieba


def getHtml(url):
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0')
    doc = urllib2.urlopen(request, timeout=45).read().decode('gbk')
    return doc


# Only one seed is crawled per run; swap the comments to switch channels.
# seed = 'http://it.dataguru.cn/'
# seed = 'http://bi.dataguru.cn/'
seed = 'http://science.dataguru.cn/'
que_urls = Queue.Queue()
que_urls.put(seed)


def getCurTimeStamp(root='/data/data/dataguru/science/'):
    """Build an output file path from the current timestamp
    (milliseconds since midnight, January 1, 1970)."""
    return root + str(int(time.time() * 1000)) + '.txt'


def start():
    while que_urls.qsize() > 0:
        url = que_urls.get()
        html = getHtml(url)
        dom = etree.HTML(html)
        # links = dom.xpath(u"//div[@id='ct']//a[@class='xi2']/@href")
        links = dom.xpath(u"//div[@id='ct']//a[@class='xi2']")
        print len(links)
        for lk in links:
            print lk.text, lk.xpath('./@href')
            try:
                link = lk.xpath('./@href')[0]
                html_c = getHtml(link)
                dom_c = etree.HTML(html_c)
                article = dom_c.xpath('//td[@id="article_content"]//text()')
                content = os.linesep.join(article)
                content = content.replace('\r\n', '')  # strip stray Windows line endings
                # One article per file, named by millisecond timestamp.
                with open(getCurTimeStamp(), 'wb') as mf:
                    mf.write(link + os.linesep)
                    mf.write(lk.text.encode('utf-8') + os.linesep)
                    mf.write(content.encode('utf-8'))
            except Exception as e:
                print e
                continue
        # Follow the "next page" link so the crawl continues through the listing.
        links_next = dom.xpath('//div[@id="ct"]//a[@class="nxt"]')
        for lk in links_next:
            print lk.text, lk.xpath('./@href')
            que_urls.put(lk.xpath('./@href')[0])


if __name__ == '__main__':
    # start()
    # jieba demo: only the last assignment to sen is segmented.
    # sen = '我来到北京清华大学'
    sen = '他来到了网易杭研大厦'
    seg_list = jieba.cut(sen, cut_all=False)  # precise mode
    res = "/ ".join(seg_list)  # jieba.cut returns a generator; join consumes it once
    print type(seg_list)
    print "Default Mode:", res
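The same queue-driven approach also works in Java. The class below crawls Baidu Zhidao: index (browse) URLs are expanded into question links and pagination links, while question URLs have their titles extracted via HtmlCleaner's XPath support.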
package com.data.crawl.qa.baiduzhidao;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
import java.util.Queue;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
public class Crawl {

    private static final Log log = LogFactory.getLog(Crawl.class);

    private HtmlCleaner cleaner = new HtmlCleaner();
    private HttpClientPool httpPool = new HttpClientPool();
    private Queue<String> queue = new LinkedList<String>();

    // Index (category) pages, e.g.:
    // http://zhidao.baidu.com/browse/82
    // http://zhidao.baidu.com/browse/82?pn=25#list
    // http://zhidao.baidu.com/browse/82?pn=50#list
    private static final Pattern PAT_INDEX =
            Pattern.compile("http://zhidao\\.baidu\\.com/browse/\\d+(\\?pn=\\d+#list)?");

    // Question (content) pages, e.g.:
    // http://zhidao.baidu.com/question/1732680699842305627.html?entry=qb_browse_default
    // http://zhidao.baidu.com/question/368440625636623924.html?entry=qb_browse_default
    private static final Pattern PAT_CONTENT =
            Pattern.compile("http://zhidao\\.baidu\\.com/question/\\d+\\.html\\?entry=qb_browse_default");

    public void start(String seed) {
        queue.add(seed);
        while (!queue.isEmpty()) {
            String uri = queue.poll();
            String html = httpPool.downHtml(uri);
            if (PAT_INDEX.matcher(uri).find()) {
                getOutlinks(html, uri);
            } else if (PAT_CONTENT.matcher(uri).find()) {
                getFields(html, uri);
            } else {
                log.info("URL matched no pattern, skipping: " + uri);
            }
        }
    }

    private void getFields(String html, String uri) {
        TagNode doc = cleaner.clean(html);
        try {
            // The trailing space in "ask-title " matches the page's actual class attribute.
            Object[] tags_title = doc.evaluateXPath("//span[@class='ask-title ']");
            if (tags_title.length == 0) {
                log.warn("no title found: " + uri);
                return;
            }
            String title = ((TagNode) tags_title[0]).getText().toString();
            log.info(title);
        } catch (XPatherException e) {
            log.warn(e.getMessage());
        }
    }

    public void getOutlinks(String html, String base) {
        TagNode doc = cleaner.clean(html);
        try {
            URL baseUrl = new URL(base);
            // Enqueue every question link found on this index page.
            Object[] tags_content = doc.evaluateXPath("//a[@class='question-title']");
            for (Object object : tags_content) {
                String relativeUrl = ((TagNode) object).getAttributeByName("href");
                URL url = new URL(baseUrl, relativeUrl);
                queue.add(url.toString());
            }
            // Follow pagination; the last page has no "next" link, so guard the lookup.
            Object[] tags_next = doc.evaluateXPath("//a[@class='pager-next']");
            if (tags_next.length > 0) {
                String relativeUrlNext = ((TagNode) tags_next[0]).getAttributeByName("href");
                URL url = new URL(baseUrl, relativeUrlNext);
                queue.add(url.toString());
            }
        } catch (XPatherException e) {
            log.warn(e.getMessage());
        } catch (MalformedURLException e) {
            log.warn("bad URL: " + e.getMessage());
        }
    }

    public static void main(String[] args) {
        Crawl crawl = new Crawl();
        String seed = "http://zhidao.baidu.com/browse/82";
        crawl.start(seed);
        log.info("complete");
    }
}
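Crawl depends on an HttpClientPool helper that is not shown in this section; only the class name and its downHtml(String) method can be inferred from the calls above. A minimal sketch of what such a helper might look like follows, using the JDK's HttpURLConnection to stay self-contained; the original presumably wrapped a pooled Apache HttpClient, so everything beyond the class name and the downHtml signature is an assumption.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

// Hypothetical stand-in for the HttpClientPool used by Crawl above;
// only the downHtml(String) signature is taken from the original code.
public class HttpClientPool {

    public String downHtml(String uri) {
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(uri).openConnection();
            conn.setConnectTimeout(10000);
            conn.setReadTimeout(45000);
            conn.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0");
            StringBuilder sb = new StringBuilder();
            // Assumption: GBK, the encoding Baidu Zhidao pages used at the time.
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "GBK"));
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    sb.append(line).append('\n');
                }
            } finally {
                in.close();
            }
            return sb.toString();
        } catch (IOException e) {
            // An empty page matches neither XPath above, so the caller simply moves on.
            return "";
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }
}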