Java Weibo Crawler: A Million Records a Day
I had never written a crawler before and was thrown in at the deep end. The company had an old codebase, so I used it as a reference; after three weeks of reading and rewriting I had overhauled the whole thing and reached a crawl volume of one million records per day from a single program.
The stack: Spring Boot + JDK 1.8 + MySQL + Redis.
There are three main parts: keyword crawling, a Redis queue, and the multi-threaded crawler program.
Part 1: Keyword crawling
I crawl by entering keywords into Weibo's search box, so the hotness and freshness of the keywords matter a great deal.
I scrape the trending lists of Baidu, Sogou, Weibo, and 360 every 40 seconds for real-time hot words.
Step 1: Find sites with high-quality hot words.
# Baidu trending URLs
baidu.hotnews = http://top.baidu.com/buzz?b=1&fr=topnews
baidu.topcategory = http://top.baidu.com/buzz?b=2&c=12&fr=topcategory_c12
baidu.oneday.hotbuzz = http://top.baidu.com/buzz?b=341&fr=topbuzz_b1
baidu.oneday.lifehot = http://top.baidu.com/buzz?b=342&c=513&fr=topbuzz_b344_c513
# Weibo trending URLs
weibo.realtimehot = https://s.weibo.com/top/summary?cate=realtimehot
weibo.realtime = https://weibo.com/a/hot/realtime
# Sogou trending URLs
sogou.hotTop1 = http://top.sogou.com/hot/shishi_1.html
sogou.hotTop2 = http://top.sogou.com/hot/shishi_2.html
sogou.hotTop3 = http://top.sogou.com/hot/shishi_3.html
# 360 trending URLs
360.hotlist.star = https://trends.so.com/top/list?cate1=%E4%BA%BA%E7%89%A9&cate2=%E6%98%8E%E6%98%9F&page=1&size=100
360.hotlist.netstar = https://trends.so.com/top/list?cate1=%E4%BA%BA%E7%89%A9&cate2=%E7%BD%91%E7%BA%A2&page=1&size=100
360.hotlist.famous = https://trends.so.com/top/list?cate1=%E4%BA%BA%E7%89%A9&cate2=%E5%90%8D%E5%AE%B6&page=1&size=100
360.hotlist.website = https://trends.so.com/top/list?cate1=%E7%BD%91%E7%AB%99&cate2=&page=1&size=100
360.hotlist.ip = https://trends.so.com/top/list?cate1=IP&cate2=&page=1&size=100
360.hotlist.ai = https://trends.so.com/top/list?cate1=%E6%99%BA%E8%83%BD%E7%BB%88%E7%AB%AF&cate2=%E6%89%8B%E6%9C%BA&page=10&size=100
360.hotlist.car = https://trends.so.com/top/list?cate1=%E6%B1%BD%E8%BD%A6&cate2=&page=11&size=100
360.hotlist.live = https://trends.so.com/top/list?cate1=%E7%9B%B4%E6%92%AD&cate2=%E4%B8%BB%E6%92%AD&page=8&size=80
360.hotlist.livesite = https://trends.so.com/top/list?cate1=%E7%9B%B4%E6%92%AD&cate2=%E7%9B%B4%E6%92%AD%E5%B9%B3%E5%8F%B0&page=6&size=60
360.hotlist.drink = https://trends.so.com/top/list?cate1=%E9%85%92%E7%B1%BB&cate2=&page=1&size=40
360.hotlist.carton = https://trends.so.com/top/list?cate1=%E5%A8%B1%E4%B9%90&cate2=%E5%8A%A8%E6%BC%AB&page=1&size=100
360.hotlist.sports = https://trends.so.com/top/list?cate1=%E5%A8%B1%E4%B9%90&cate2=%E4%BD%93%E8%82%B2&page=1&size=100
360.hotlist.music = https://trends.so.com/top/list?cate1=%E5%A8%B1%E4%B9%90&cate2=%E9%9F%B3%E4%B9%90&page=1&size=100
360.hotlist.movie = https://trends.so.com/top/list?cate1=%E5%A8%B1%E4%B9%90&cate2=%E7%94%B5%E5%BD%B1&page=8&size=100
360.hotlist.tv = https://trends.so.com/top/list?cate1=%E9%85%92%E7%B1%BB&cate2=&page=6&size=100
360.hotlist.fun = https://trends.so.com/top/list?cate1=%E5%A8%B1%E4%B9%90&cate2=%E7%94%B5%E8%A7%86%E5%89%A7&page=6&size=100
360.hotlist.novel = https://trends.so.com/top/list?cate1=%E5%A8%B1%E4%B9%90&cate2=%E5%B0%8F%E8%AF%B4&page=1&size=100
360.hotlist.game = https://trends.so.com/top/list?cate1=%E5%A8%B1%E4%B9%90&cate2=%E6%B8%B8%E6%88%8F&page=6&size=100
360.hotlist.cosmetics = https://trends.so.com/top/list?cate1=%E5%8C%96%E5%A6%86%E5%93%81&cate2=&page=4&size=40
360.hotlist.luxury = https://trends.so.com/top/list?cate1=%E5%A5%A2%E4%BE%88%E5%93%81&cate2=&page=3&size=30
(These are the trending pages I crawl for keywords; the quality of these hot words is extremely high.)
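These URLs live in a properties file; a minimal sketch of injecting a couple of them with Spring's @Value follows (the class and field names here are illustrative, not from the original project):

// Hypothetical sketch: binding two of the trending-list URLs from the
// properties file above. Class and field names are assumptions.
@Component
public class HotListUrls {

    @Value("${weibo.realtimehot}")
    private String weiboRealtimeHot;

    @Value("${baidu.hotnews}")
    private String baiduHotNews;

    public String getWeiboRealtimeHot() { return weiboRealtimeHot; }
    public String getBaiduHotNews() { return baiduHotNews; }
}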
Step 2: Crawl the hot words.
Take the Weibo real-time trending list as an example.
String str = "https://s.weibo.com/top/summary?cate=realtimehot"; // page URL
HotListSearch hotListSearch = new HotListSearch(); // create the hot-word crawler
List<Keywords> keywords = hotListSearch.queryWeibo(str); // crawl the Weibo trending list
int i = 1;
for (Keywords key : keywords) { // the crawled results are wrapped as Java objects
    System.out.println("No." + i + "===========" + key.toString());
    i++;
}
HotListSearch.class
public class HotListSearch {

    private HttpProxy proxy;

    public HotListSearch() {
        this(null);
    }

    public HotListSearch(HttpProxy proxy) {
        this.proxy = proxy;
    }

    /* Weibo trending list */
    public List<Keywords> queryWeibo(String url) {
        Connect connect = new Connect();
        String html = connect.get(url, proxy);
        String str = "div[class=data] tbody tr"; // CSS selector for the rows Jsoup should extract
        List<Keywords> keywords = parseWeibo(html, str); // parse the HTML into the result list
        return keywords;
    }

    /* parse the HTML into a list of keywords */
    private List<Keywords> parseWeibo(String html, String str) {
        if (html == null || html.isEmpty())
            return null;
        Document doc = Jsoup.parse(html); // parse the HTML into a DOM
        Elements list = doc.select(str);  // split the Document into rows by the CSS selector
        if (list == null || list.isEmpty())
            return null;
        List<Keywords> keywords = new ArrayList<>();
        for (int i = 0, len = list.size(); i < len; i++) {
            try {
                // parses a single row element into a Keywords object
                HotSearchElementParser parser = new HotSearchElementParser();
                Keywords key = parser.parseSearchWeibo(list.get(i));
                if (key != null)
                    keywords.add(key);
            } catch (Exception e) {
                e.printStackTrace(); // skip rows that fail to parse
            }
        }
        return keywords;
    }
}
HotSearchElementParser.class
public class HotSearchElementParser {

    public Keywords parseSearchWeibo(Element item) throws ParseException {
        Keywords keywords = new Keywords();
        String querystr = item.select("td[class=td-02] a").text(); // extract the hot word
        if (querystr == null || querystr.isEmpty()) {
            return null;
        }
        keywords.setQuerystr(querystr);
        return keywords;
    }
}
Keywords.class
/**
 * A keyword to download.
 */
public class Keywords implements Serializable {

    private static final long serialVersionUID = 1L;

    private int id;
    private String querystr;
    private String region;         // keywords region
    private String nodup;          // keywords nodup
    private int status;            // status, 1: downloading, 2: suspended
    private long next;             // time of the next load
    private String growth;         // download counts of the last 5 runs
    private long lastDownloadTime; // time of the last download
    private int total;             // total downloads
    private int amount;            // amount of downloads
    private String updateDate;

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }
    public String getQuerystr() { return querystr; }
    public void setQuerystr(String querystr) { this.querystr = querystr; }
    public String getRegion() { return region; }
    public void setRegion(String region) { this.region = region; }
    public String getNodup() { return nodup; }
    public void setNodup(String nodup) { this.nodup = nodup; }
    public int getStatus() { return status; }
    public void setStatus(int status) { this.status = status; }
    public long getNext() { return next; }
    public void setNext(long next) { this.next = next; }
    public String getGrowth() { return growth; }
    public void setGrowth(String growth) { this.growth = growth; }
    public long getLastDownloadTime() { return lastDownloadTime; }
    public void setLastDownloadTime(long lastDownloadTime) { this.lastDownloadTime = lastDownloadTime; }
    public int getTotal() { return total; }
    public void setTotal(int total) { this.total = total; }
    public int getAmount() { return amount; }
    public void setAmount(int amount) { this.amount = amount; }
    public String getUpdateDate() { return updateDate; }
    public void setUpdateDate(String updateDate) { this.updateDate = updateDate; }

    @Override
    public String toString() {
        return "Keywords{" + "id=" + id + ", querystr='" + querystr + '\'' + ", region='" + region + '\''
                + ", nodup='" + nodup + '\'' + ", status=" + status + ", next=" + next + ", growth='" + growth + '\''
                + ", lastDownloadTime=" + lastDownloadTime + ", total=" + total + ", amount=" + amount
                + ", updateDate='" + updateDate + '\'' + '}';
    }
}
Connect.class
package com.cnxunao.common.utils;

import com.cnxunao.weibospider.entities.HttpProxy;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.config.RequestConfig.Builder;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Random;

public class Connect {

    private static Logger logger = LoggerFactory.getLogger(Connect.class);

    public String get(String url) {
        return get(url, null);
    }

    public String get(String url, HttpProxy proxy) {
        try (CloseableHttpClient httpclient = HttpClients.custom().setUserAgent(this.userAgent).build()) {
            HttpGet request = new HttpGet(url.trim());
            HttpContext context = createContext(proxy);
            try (CloseableHttpResponse response = httpclient.execute(request, context)) {
                return EntityUtils.toString(response.getEntity(), charset);
            }
        } catch (Exception e) {
            e.printStackTrace();
            throw new IllegalArgumentException("timeout");
        }
    }

    public String getKeyword(String targetUrl, HttpProxy proxy) {
        String proxyHost = proxy.getHost();
        int proxyPort = proxy.getPort();
        Proxy.Type proxyType = Proxy.Type.SOCKS;
        try {
            InetSocketAddress addr = new InetSocketAddress(proxyHost, proxyPort);
            Proxy httpProxy = new Proxy(proxyType, addr);
            URL url = new URL(targetUrl);
            URLConnection conn = url.openConnection(httpProxy);
            InputStream in = conn.getInputStream();
            return IO2String(in);
        } catch (Exception e) {
            e.printStackTrace();
            throw new IllegalArgumentException("timeout");
        }
    }

    public String get(String url, HttpProxy proxy, int reconnectionTimes) {
        if (reconnectionTimes < 2)
            return get(url, proxy);
        if (reconnectionTimes > 5)
            throw new IllegalArgumentException("Too many reconnection");
        String html = null;
        for (int i = 0; i < reconnectionTimes; i++) {
            try {
                html = get(url, proxy);
                break;
            } catch (Exception e) {
                logger.error("reconnection: {}", url);
                try {
                    Thread.sleep(1_500L);
                } catch (InterruptedException e1) {
                }
            }
        }
        if (html == null)
            throw new IllegalArgumentException("timeout");
        return html;
    }

    private HttpContext createContext(HttpProxy proxy) {
        HttpClientContext context = HttpClientContext.create();
        Builder builder = RequestConfig.custom().setConnectTimeout(timeout).setSocketTimeout(timeout);
        if (proxy != null && StringUtils.isNotEmpty(proxy.getHost())) {
            builder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort()));
            if (StringUtils.isNotEmpty(proxy.getUsername()) && StringUtils.isNotEmpty(proxy.getPassword())) {
                CredentialsProvider credsProvider = new BasicCredentialsProvider();
                credsProvider.setCredentials(new AuthScope(proxy.getHost(), proxy.getPort()),
                        new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
                context.setCredentialsProvider(credsProvider);
            }
        }
        RequestConfig config = builder.build();
        context.setRequestConfig(config);
        return context;
    }

    private static Random random = new Random();

    // a random user agent is picked from the pool below for each Connect instance
    // private String userAgent = "Opera/9.27 (Windows NT 5.2; U; zh-cn)";
    private String userAgent = userAgents[random.nextInt(userAgents.length)];

    public void setUserAgent(String userAgent) {
        this.userAgent = userAgent;
    }

    private String charset = "UTF-8";

    public void setCharset(String charset) {
        this.charset = charset;
    }

    private int timeout = 15_000;

    public void setTimeout(int timeout) {
        this.timeout = timeout;
    }

    public static String IO2String(InputStream inStream) throws IOException {
        ByteArrayOutputStream result = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        int len;
        while ((len = inStream.read(buffer)) != -1) {
            result.write(buffer, 0, len);
        }
        return result.toString(StandardCharsets.UTF_8.name());
    }

    // user-agent pool
    private static String[] userAgents = {
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"
    };
}
HttpResponse.class
package com.cnxunao.weibospider.utils;

import java.util.Vector;

public class HttpResponse {

    String urlString;
    int defaultPort;
    String file;
    String host;
    String path;
    int port;
    String protocol;
    String query;
    String ref;
    String userInfo;
    String contentEncoding;
    int contentLength;
    String content;
    String contentType;
    int code;
    String message;
    String method;
    int connectTimeout;
    int readTimeout;
    Vector<String> contentCollection;

    public String getContent() { return content; }
    public String getContentType() { return contentType; }
    public int getCode() { return code; }
    public String getMessage() { return message; }
    public Vector<String> getContentCollection() { return contentCollection; }
    public String getContentEncoding() { return contentEncoding; }
    public String getMethod() { return method; }
    public int getConnectTimeout() { return connectTimeout; }
    public int getReadTimeout() { return readTimeout; }
    public String getUrlString() { return urlString; }
    public int getDefaultPort() { return defaultPort; }
    public String getFile() { return file; }
    public String getHost() { return host; }
    public String getPath() { return path; }
    public int getPort() { return port; }
    public String getProtocol() { return protocol; }
    public String getQuery() { return query; }
    public String getRef() { return ref; }
    public String getUserInfo() { return userInfo; }
}
Once the test passes, write a scheduled task with @Scheduled that periodically pushes the crawled keywords into a Redis queue.
WeiboHotThread.class
/* Crawls the Weibo real-time trending list */
@Component
@EnableScheduling
public class WeiboHotThread {

    protected Logger logger = LoggerFactory.getLogger(getClass());

    @Autowired
    RedisTempService redisService;
    @Autowired
    private HotListSearch hotListSearch;

    @Scheduled(initialDelay = 80_000, fixedRate = 120_000)
    public void run() {
        logger.info("Weibo trending crawl started");
        // only top up the queue when the backlog drops below 600 keywords
        if (redisService.count("KeywordsQueue") <= 600) {
            List<Keywords> list = hotListSearch.queryWeibo("https://s.weibo.com/top/summary?cate=realtimehot");
            Keywords[] array = list.toArray(new Keywords[0]);
            redisService.lpush("KeywordsQueue", array); // push into the Redis queue
            logger.info("Successfully downloaded keywords, added to redis: " + array.length);
        }
    }
}
RedisTempService.class (I won't walk through the concrete Redis operations here; one method is attached directly below)
// push elements onto the queue
public void lpush(String key, Serializable... keywords) {
    redisTemplate.opsForList().leftPushAll(key, keywords);
}
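Two other RedisTempService methods appear in this post, count (used by WeiboHotThread above) and a batch rpop (used by DownloadKeywordThread below), but their bodies were not included. A plausible sketch, assuming the same redisTemplate field as lpush:

// Plausible sketches only; the real RedisTempService bodies are not shown in this post.

// queue length, used by WeiboHotThread to cap the backlog
public long count(String key) {
    Long size = redisTemplate.opsForList().size(key);
    return size == null ? 0 : size;
}

// pop up to `size` elements from the tail of the queue
public List<Serializable> rpop(String key, int size) {
    List<Serializable> list = new ArrayList<>();
    for (int i = 0; i < size; i++) {
        Serializable value = (Serializable) redisTemplate.opsForList().rightPop(key);
        if (value == null)
            break; // queue drained
        list.add(value);
    }
    return list;
}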
Step 3: Crawl Weibo posts for each keyword.
The rough idea: scheduled threads fetch proxies and keywords, each keyword is turned into a search URL, the URL is requested through a proxy, the response is parsed into Java objects and written out as XML; another scheduled thread periodically packs the accumulated XML files into a zip archive, and from there the archives are yours to process however you like.
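KeywordsSearch, the class that turns a keyword into a request against Weibo's search page, is not included below; the core of it is just URL-encoding the keyword into the search endpoint. A rough sketch (the exact query parameters the real class uses are an assumption here):

// Rough sketch: building a Weibo search URL from a keyword. The real
// KeywordsSearch class is not shown in this post, so treat the parameters
// as illustrative.
public static String buildSearchUrl(String keyword, int page) throws UnsupportedEncodingException {
    String q = URLEncoder.encode(keyword, "UTF-8");
    return "https://s.weibo.com/weibo?q=" + q + "&page=" + page;
}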
Part of the code is pasted below for reference.
AbstractDownload.class
public abstract class AbstractDownload<T> {

    protected Logger logger = LoggerFactory.getLogger(getClass());

    @Autowired
    RedisTempService redisService;

    protected void exec(boolean multi, int multinum, int multiple, ThreadPoolExecutor executor) {
        if (multi)
            multi(multinum, multiple, executor);
        else
            single();
    }

    private void multi(int multinum, int multiple, ThreadPoolExecutor executor) {
        if (multinum == 1) {
            single();
            return;
        }
        // fetch up to multinum proxies and `multiple` entities per proxy,
        // then fan each proxy's batch out onto the thread pool
        List<HttpProxy> proxys = getValidProxy(multinum);
        List<T> entities = getValidEntity(proxys.size() * multiple);
        int total = entities.size();
        int len = total / multiple + (total % multiple == 0 ? 0 : 1);
        CompletableFuture<?>[] cfs = IntStream.range(0, len).mapToObj(i -> {
            HttpProxy proxy = proxys.get(i);
            CopyOnWriteArrayList<T> list = new CopyOnWriteArrayList<>(
                    entities.subList(i * multiple, i == len - 1 ? total : (i + 1) * multiple));
            return CompletableFuture.runAsync(() -> download(proxy, list), executor);
        }).toArray(CompletableFuture[]::new);
        CompletableFuture.allOf(cfs).join();
    }

    private void single() {
        HttpProxy proxy = getValidProxy(1).get(0);
        T entity = getValidEntity(1).get(0);
        download(proxy, entity);
    }

    private void download(HttpProxy proxy, CopyOnWriteArrayList<T> entities) {
        // drain the batch: successfully downloaded entities are removed,
        // failed ones are kept and skipped
        int i = 0;
        while (i < entities.size()) {
            try {
                download(proxy, entities.get(i));
                entities.remove(i); // removal shifts the list, so the index stays put
            } catch (Exception e) {
                logger.error(e.getMessage());
                i++; // keep the failed entity and move on
            }
            // the proxy is rotated after the last download, so no pause is needed then
            if (i < entities.size()) {
                try {
                    Thread.sleep(getPauseTime());
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        }
    }

    public abstract void download(HttpProxy proxy, T entity);

    // validate the entity before downloading
    protected abstract void validate(T entity);

    // run the query
    protected abstract List<Weibo> query(HttpProxy proxy, T entity);

    // after a download finishes, update the time of the next download
    protected abstract void updateEntity(T entity, List<Weibo> weibos);

    // persist the download log
    protected abstract void saveDownloadLog(T entity, HttpProxy proxy, long consumeTime, List<Weibo> weibos);

    /* write downloaded posts to a temporary XML file */
    protected void storeWeibos(List<Weibo> weibos) {
        if (weibos == null || weibos.isEmpty())
            return;
        try {
            WeiboUtils.writeToTempXml(weibos);
        } catch (IOException e) {
            logger.error("write temp xml error.", e);
        }
    }

    protected abstract List<HttpProxy> getValidProxy(int size);

    protected abstract List<T> getValidEntity(int size);

    // pause between two consecutive downloads
    protected int getPauseTime() {
        return 1000 * RandomUtils.nextInt(3, 5);
    }

    protected static class DefaultThreadFactory implements ThreadFactory {
        private final AtomicInteger threadNumber = new AtomicInteger(1);
        private final String namePrefix;

        DefaultThreadFactory(String namePrefix) {
            this.namePrefix = namePrefix;
        }

        @Override
        public Thread newThread(Runnable r) {
            Thread t = new Thread(r, namePrefix + threadNumber.getAndIncrement());
            if (t.isDaemon())
                t.setDaemon(false);
            if (t.getPriority() != Thread.NORM_PRIORITY)
                t.setPriority(Thread.NORM_PRIORITY);
            return t;
        }
    }
}
DownloadKeywordThread.class
@Component
@EnableAsync
@EnableScheduling
public class DownloadKeywordThread extends AbstractDownload<Keywords> {

    @Value("${download.keyword.use}")
    private boolean use;
    @Value("${download.keyword.multi}")
    private boolean multi;
    @Value("${download.keyword.multinum}")
    private int multinum;
    @Value("${download.keyword.multiple}")
    private int multiple;

    @Autowired
    HttpProxyService proxyService;
    @Autowired
    DownloadLogService logService;
    @Autowired
    KeywordsService kwService;
    @Autowired
    RedisTempService redisService;

    private ThreadPoolExecutor executor;

    public DownloadKeywordThread() {
        int nThreads = Runtime.getRuntime().availableProcessors() * 3;
        executor = new ThreadPoolExecutor(nThreads, nThreads, 0, TimeUnit.MILLISECONDS,
                new LinkedBlockingQueue<Runnable>(100), new DefaultThreadFactory("download.keyword-"));
    }

    @Async
    @Scheduled(initialDelay = 10_000, fixedRate = 1_000)
    public void run() throws InterruptedException {
        logger.info("keyword download started");
        if (use) {
            try {
                exec(multi, multinum, multiple, executor);
            } catch (Exception e) {
                logger.info(e.getMessage());
            }
        }
    }

    @Override
    protected void validate(Keywords entity) {
        if (StringUtils.isEmpty(entity.getQuerystr())) {
            entity.setStatus(Constants.STATUS_SUSPEND);
            kwService.saveOrUpdate(entity);
            throw new IllegalArgumentException("Keywords not null");
        }
    }

    @Override
    protected List<Weibo> query(HttpProxy proxy, Keywords kw) {
        List<Weibo> weibos = null;
        for (int i = 0; i < 3; i++) {
            try {
                KeywordsSearch download = new KeywordsSearch(proxy);
                weibos = download.query(kw);
                proxy.setSuccess(proxy.getSuccess() + 1);
                logger.info("Successful download, weibos: {}, keywords: {}, proxy: {}",
                        weibos.size(), kw.getQuerystr(), proxy != null ? proxy.getHost() : "");
                break;
            } catch (NullPointerException e1) {
                // the proxy has been rate-limited by Weibo
                logger.error("proxyIp {} is limit by weibo", proxy.getHost());
                proxy.setFailure(proxy.getFailure() + 1);
                break;
            } catch (Exception e) {
                // failed to connect to the proxy
                if ("timeout".equals(e.getMessage())) {
                    logger.error("can not connect to proxyIp: {} ", proxy.getHost());
                    proxy.setFailure(proxy.getFailure() + 1);
                    break;
                }
                // Weibo returned no results for the keyword
                if ("noresult".equals(e.getMessage())) {
                    logger.error("Keywords {} not found relevant results", kw.getQuerystr());
                    break;
                }
                // the proxy was challenged with a captcha
                if ("verification".equals(e.getMessage())) {
                    proxy.setFailure(proxy.getFailure() + 1);
                    proxy.setStatus(Constants.STATUS_SUSPEND);
                    logger.error("Proxy {}:{} requires verification code", proxy.getHost(), proxy.getPort());
                    break;
                }
            } finally {
                queryFinally(proxy);
            }
        }
        return weibos;
    }

    @Override
    protected void saveDownloadLog(Keywords entity, HttpProxy proxy, long consumeTime, List<Weibo> weibos) {
        logService.storeLog(entity.getQuerystr(), proxy, Constants.TYPE_KEYWORDS, consumeTime, weibos);
    }

    /* valid proxies */
    @Override
    protected List<HttpProxy> getValidProxy(int size) {
        List<HttpProxy> list = StaticService.getVailid().stream()
                // unused for at least the last 6 seconds
                .filter(proxy -> proxy.getLastUseTime() + 6_000 < System.currentTimeMillis())
                .collect(Collectors.toList());
        if (CollectionUtils.isEmpty(list))
            throw new IllegalArgumentException("not found valid proxy");
        return list;
    }

    /* keywords, size = proxy.size * multiple */
    @Override
    protected List<Keywords> getValidEntity(int size) {
        List<Serializable> list = (List<Serializable>) redisService.rpop("KeywordsQueue", size);
        if (CollectionUtils.isEmpty(list))
            throw new IllegalArgumentException("not found valid keywords");
        JSONArray jsonArray = JSONArray.fromObject(list);
        List<Keywords> arrayList = JSONArray.toList(jsonArray, Keywords.class);
        return arrayList;
    }

    @Override
    protected void updateEntity(Keywords entity, List<Weibo> weibos) {
        kwService.updateAfterDownload(entity, weibos);
    }

    // keep healthy proxies alive, drop the rest
    private void queryFinally(HttpProxy proxy) {
        if (proxy.getFailure() <= 3 && proxy.getLiveTime() > (System.currentTimeMillis() / 1000)) {
            proxy.setStatus(1);
            StaticService.update(proxy);
            proxyService.saveOrUpdate(proxy);
        } else {
            proxyService.deleteByHostAndPort(proxy.getHost(), proxy.getPort());
            StaticService.del(proxy);
        }
    }

    @Override
    public void download(HttpProxy proxy, Keywords entity) {
        try {
            long consumeTime = System.currentTimeMillis();
            List<Weibo> weibos = query(proxy, entity);
            storeWeibos(weibos);
            if (entity != null && !entity.getRegion().equalsIgnoreCase("hot")) {
                updateEntity(entity, weibos);
            }
            consumeTime = System.currentTimeMillis() - consumeTime;
            saveDownloadLog(entity, proxy, consumeTime, weibos);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
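For reference, the @Value fields above imply configuration entries along these lines (the values here are examples, not the ones I ran in production):

# example values only; tune multinum/multiple to the size of your proxy pool
download.keyword.use = true
download.keyword.multi = true
download.keyword.multinum = 10
download.keyword.multiple = 10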
Storage.class (packs the XML files into a zip archive)
@Component
public class Storage {

    private static Logger logger = LoggerFactory.getLogger(Storage.class);

    private BloomFilter<String> filter;

    public Storage() {
        int expectedInsertions = Integer.MAX_VALUE >> 4;
        filter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), expectedInsertions);
    }

    @Scheduled(initialDelay = 10_000, fixedRate = 540_000)
    public void run() {
        logger.info("storage thread running.");
        try {
            JSONArray jArray = readTempXml();
            if (jArray == null || jArray.isEmpty())
                return;
            writeToZip(jArray);
        } catch (Exception e) {
            logger.error(e.getMessage());
        }
    }

    private void writeToZip(JSONArray jArray) {
        // name of the archive to write
        String filename = getFilename(jArray);
        try (ZipOutputStream output = new ZipOutputStream(new FileOutputStream(filename))) {
            // split the posts into XML entries of 100 records each
            int total = jArray.size(), xmlsize = 100;
            for (int i = 0, len = total / xmlsize + (total % xmlsize == 0 ? 0 : 1); i < len; i++) {
                int fromIndex = i * xmlsize, toIndex = i == len - 1 ? total : (i + 1) * xmlsize;
                JSONArray list = JSONArray.fromObject(jArray.subList(fromIndex, toIndex));
                ZipEntry entry = new ZipEntry((i + 1) + ".xml");
                output.putNextEntry(entry);
                XmlWriter writer = new XmlWriter();
                writer.write(list, output);
            }
        } catch (Exception e) {
            logger.error("write to zip: {}", e.getMessage());
        }
        logger.info("{}\t{}", jArray.size(), filename);
        WeiboUtils.total += jArray.size();
        logger.info("total downloads: {}", WeiboUtils.total);
    }

    private String getFilename(JSONArray jArray) {
        File directory = new File(
                Constants.STORE_BASE + File.separator + DateFormatUtils.format(new Date(), "yyyyMMdd"));
        if (!directory.exists())
            directory.mkdirs();
        int index;
        Collection<File> c = FileUtils.listFiles(directory, new String[] { "zip" }, true);
        if (!c.isEmpty()) {
            index = c.stream().mapToInt(file -> {
                String filename = StringUtils.substringBefore(file.getName(), "_");
                return NumberUtils.toInt(filename);
            }).max().getAsInt() + 1;
        } else {
            index = 1;
        }
        return directory.getPath() + File.separator + index + "_" + jArray.size() + ".zip";
    }

    private JSONArray readTempXml() {
        File directory = new File(Constants.STORE_TEMP);
        if (!directory.isDirectory()) {
            logger.error("{} is not a directory", directory.getPath());
            return null;
        }
        Collection<File> c = FileUtils.listFiles(directory, new String[] { "xml" }, true);
        if (c.isEmpty()) {
            logger.info("XML file not found");
            return null;
        }
        JSONArray jArray = new JSONArray();
        for (File file : c) {
            try {
                XmlReader reader = new XmlReader();
                JSONArray subArray = reader.read(file.getAbsolutePath());
                logger.info("read temp xml: " + file.getAbsolutePath());
                for (int i = 0, len = subArray.size(); i < len; i++) {
                    JSONObject jObject = subArray.getJSONObject(i);
                    try {
                        // de-duplicate by the MD5 of the post URL
                        String ur = jObject.getString("ur");
                        String md5Hex = DigestUtils.md5DigestAsHex(ur.getBytes());
                        if (!filter.mightContain(md5Hex)) {
                            jArray.add(jObject);
                            filter.put(md5Hex);
                        }
                    } catch (Exception e) {
                        // skip entries without a usable "ur" field
                    }
                }
            } catch (Exception e) {
                logger.error("read xml: {}", e.getMessage());
            } finally {
                file.delete();
            }
        }
        return jArray;
    }
}
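One note on the Bloom filter: Guava's two-argument BloomFilter.create defaults to a 3% false-positive probability, which at this scale means a small share of genuinely new posts will be silently dropped as duplicates. If that matters to you, the overload with an explicit fpp trades memory for accuracy:

// explicit false-positive probability: larger bit array, fewer dropped posts
filter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), expectedInsertions, 0.001);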
XmlReader.class
public class XmlReader {

    public XmlReader() {
    }

    public JSONArray read(String filename) throws IOException, ParserConfigurationException, SAXException {
        try (InputStream input = new FileInputStream(filename)) {
            return read(input);
        }
    }

    public JSONArray read(InputStream input) throws ParserConfigurationException, SAXException, IOException {
        Document document = buildDocument(input);
        // node list
        NodeList nodes = document.getElementsByTagName("article");
        JSONArray jArray = new JSONArray();
        for (int i = 0, len = nodes.getLength(); i < len; i++) {
            // child node list
            NodeList cNodes = nodes.item(i).getChildNodes();
            if (cNodes.getLength() == 0)
                continue;
            JSONObject jObject = new JSONObject();
            for (int j = 0; j < cNodes.getLength(); j++) {
                Node cNode = cNodes.item(j);
                if (StringUtils.isNotBlank(cNode.getTextContent()))
                    // child node name and value
                    jObject.put(cNode.getNodeName().toLowerCase(), cNode.getTextContent());
            }
            if (jObject.size() > 0)
                jArray.add(jObject);
        }
        return jArray;
    }

    private Document buildDocument(InputStream in) throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        return builder.parse(in);
    }
}
XmlWriter.class
public class XmlWriter {

    public void write(JSONArray jArray, OutputStream output) throws IOException {
        String xmlContent;
        try {
            xmlContent = toXmlstr(jArray);
        } catch (TransformerException | ParserConfigurationException e) {
            throw new IOException(e);
        }
        IOUtils.write(xmlContent, output, "UTF-8");
    }

    private String toXmlstr(JSONArray jArray) throws IOException, TransformerException, ParserConfigurationException {
        TransformerFactory factory = TransformerFactory.newInstance();
        factory.setAttribute("indent-number", 4); // indentation width
        Transformer transformer = factory.newTransformer();
        transformer.setOutputProperty(OutputKeys.INDENT, "yes"); // enable line wrapping
        StringWriter writer = new StringWriter();
        Source source = new DOMSource(buildDocument(jArray));
        transformer.transform(source, new StreamResult(writer));
        return writer.toString();
    }

    private Document buildDocument(JSONArray jArray) throws ParserConfigurationException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        Document document = builder.newDocument();
        // parent
        Element root = document.createElement("articles");
        document.appendChild(root);
        for (int i = 0, len = jArray.size(); i < len; i++) {
            JSONObject jObject = jArray.getJSONObject(i);
            // children
            Element item = document.createElement("article");
            root.appendChild(item);
            for (Object key : jObject.keySet()) {
                String field = (String) key, value = jObject.getString(field);
                if (value == null || value.isEmpty())
                    continue;
                // one child element per non-empty field
                Element attr = document.createElement(field);
                attr.setTextContent(value);
                item.appendChild(attr);
            }
        }
        return document;
    }
}
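For orientation, each entry XmlWriter emits into the zip looks roughly like this. The field names, such as ur, depend on the Weibo object, which this post does not show; the values below are placeholders:

<articles>
    <article>
        <ur>https://weibo.com/placeholder/post-id</ur>
        <content>post text...</content>
    </article>
</articles>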
The structure of the crawler itself is up to you; what I mainly want to cover is how a single program reaches a million records per day:
1. Weibo's anti-crawling measures.
My countermeasures (a reconstructed sketch of the HttpProxy entity follows after this list): 1) Rotating proxy servers. I bought an IP pool of 2,500 IPs per day; I use Kuaidaili (快代理).
2) A user-agent pool, which I covered in an earlier post.
3) A crawl rate of about one request per second holds up fine.
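The HttpProxy entity that the code above passes around is never shown in the post. Based purely on the getters and setters the code calls, it is roughly the following POJO; treat this as a reconstruction, not the original class:

// Reconstructed from the calls made in the code above (getHost, getPort,
// getUsername, getPassword, getSuccess, getFailure, getLiveTime,
// getLastUseTime, setStatus). Not the original class.
public class HttpProxy implements Serializable {

    private static final long serialVersionUID = 1L;

    private String host;
    private int port;
    private String username;    // empty for proxies without authentication
    private String password;
    private int success;        // successful requests through this proxy
    private int failure;        // failed requests; dropped after more than 3
    private long liveTime;      // proxy expiry, in epoch seconds
    private long lastUseTime;   // last use, in epoch milliseconds
    private int status;         // 1: valid, 2: suspended

    public String getHost() { return host; }
    public void setHost(String host) { this.host = host; }
    public int getPort() { return port; }
    public void setPort(int port) { this.port = port; }
    public String getUsername() { return username; }
    public void setUsername(String username) { this.username = username; }
    public String getPassword() { return password; }
    public void setPassword(String password) { this.password = password; }
    public int getSuccess() { return success; }
    public void setSuccess(int success) { this.success = success; }
    public int getFailure() { return failure; }
    public void setFailure(int failure) { this.failure = failure; }
    public long getLiveTime() { return liveTime; }
    public void setLiveTime(long liveTime) { this.liveTime = liveTime; }
    public long getLastUseTime() { return lastUseTime; }
    public void setLastUseTime(long lastUseTime) { this.lastUseTime = lastUseTime; }
    public int getStatus() { return status; }
    public void setStatus(int status) { this.status = status; }
}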
2. Keyword quality.
Approach: scrape the trending lists of Weibo, Baidu, Sogou, and 360.
3. Program stability and longevity.
Handled with multi-threading + the Spring framework + scheduled restarts of the program.
I've only just started learning crawlers and there is plenty of room for improvement; feedback is welcome.
Crawling is great, but don't get carried away.
This is an original article; please message me before reposting.