本文要分析的是FetchDNS处理器。该处理器的功能是对CrawlURI curi对象所引用的主机名进行DNS解析(得到对应的IP地址),解析工作由dnsjava-2.0.3.jar组件完成(我们也可以参考本文的代码,使用dnsjava-2.0.3.jar组件的API来解析DNS)
FetchDNS处理器的重要成员变量
// Defaults. private short ClassType = DClass.IN; private short TypeType = Type.A; protected InetAddress serverInetAddr = null; /** * Used to do DNS lookups. */ protected ServerCache serverCache; public ServerCache getServerCache() { return this.serverCache; } @Autowired public void setServerCache(ServerCache serverCache) { this.serverCache = serverCache; } /** * Whether or not to perform an on-the-fly digest hash of retrieved * content-bodies. */ { setDigestContent(true); } public boolean getDigestContent() { return (Boolean) kp.get("digestContent"); } public void setDigestContent(boolean digest) { kp.put("digestContent",digest); } /** * Which algorithm (for example MD5 or SHA-1) to use to perform an * on-the-fly digest hash of retrieved content-bodies. */ String digestAlgorithm = "sha1"; public String getDigestAlgorithm() { return digestAlgorithm; } public void setDigestAlgorithm(String digestAlgorithm) { this.digestAlgorithm = digestAlgorithm; }
处理器void innerProcess(CrawlURI curi)方法
protected void innerProcess(CrawlURI curi) { Record[] rrecordSet = null; // Retrieved dns records String dnsName = null; try { dnsName = curi.getUURI().getReferencedHost(); } catch (URIException e) { logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e); } if(dnsName == null) { curi.setFetchStatus(S_UNFETCHABLE_URI); return; } CrawlHost targetHost = getServerCache().getHostFor(dnsName); //IP地址转换为InetAddress类型 if (isQuadAddress(curi, dnsName, targetHost)) { // We're done processing. return; } // Do actual DNS lookup. curi.setFetchBeginTime(System.currentTimeMillis()); // Try to get the records for this host (assume domain name) // TODO: Bug #935119 concerns potential hang here String lookupName = dnsName.endsWith(".") ? dnsName : dnsName + "."; try { //DNS解析 rrecordSet = (new Lookup(lookupName, TypeType, ClassType)).run(); } catch (TextParseException e) { rrecordSet = null; } curi.setContentType("text/dns"); if (rrecordSet != null) { if (logger.isLoggable(Level.FINE)) { logger.fine("Found recordset for " + lookupName); } //设置CrawlHost targetHost对象的IP属性; CrawlURI curi对象的Recorder httpRecorder属性 storeDNSRecord(curi, dnsName, targetHost, rrecordSet); } else { if (logger.isLoggable(Level.FINE)) { logger.fine("Failed find of recordset for " + lookupName); } if (getAcceptNonDnsResolves()||"localhost".equals(dnsName)) { // Do lookup that bypasses javadns. 
InetAddress address = null; try { address = InetAddress.getByName(dnsName); } catch (UnknownHostException e1) { address = null; } if (address != null) { targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES); curi.setFetchStatus(S_GETBYNAME_SUCCESS); if (logger.isLoggable(Level.FINE)) { logger.fine("Found address for " + dnsName + " using native dns."); } } else { if (logger.isLoggable(Level.FINE)) { logger.fine("Failed find of address for " + dnsName + " using native dns."); } setUnresolvable(curi, targetHost); } } else { setUnresolvable(curi, targetHost); } } curi.setFetchCompletedTime(System.currentTimeMillis()); }
相关调用方法如下(dnsjava-2.0.3.jar组件的API)
/** * 设置CrawlHost targetHost对象的IP属性; CrawlURI curi对象的Recorder httpRecorder属性 * @param curi * @param dnsName * @param targetHost * @param rrecordSet */ protected void storeDNSRecord(final CrawlURI curi, final String dnsName, final CrawlHost targetHost, final Record[] rrecordSet) { // Get TTL and IP info from the first A record (there may be // multiple, e.g. www.washington.edu) then update the CrawlServer ARecord arecord = getFirstARecord(rrecordSet); if (arecord == null) { throw new NullPointerException("Got null arecord for " + dnsName); } //设置CrawlHost targetHost对象IP属性 targetHost.setIP(arecord.getAddress(), arecord.getTTL()); try { //CrawlURI curi对象的Recorder httpRecorder属性 recordDNS(curi, rrecordSet); curi.setFetchStatus(S_DNS_SUCCESS); curi.setDNSServerIPLabel(ResolverConfig.getCurrentConfig().server()); } catch (IOException e) { logger.log(Level.SEVERE, "Failed store of DNS Record for " + curi.toString(), e); setUnresolvable(curi, targetHost); } } /** * IP地址转换为InetAddress * @param curi * @param dnsName * @param targetHost * @return */ protected boolean isQuadAddress(final CrawlURI curi, final String dnsName, final CrawlHost targetHost) { boolean result = false; Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName); // If it's an ip no need to do a lookup if (matcher == null || !matcher.matches()) { return result; } result = true; // Ideally this branch would never be reached: no CrawlURI // would be created for numerical IPs if (logger.isLoggable(Level.WARNING)) { logger.warning("Unnecessary DNS CrawlURI created: " + curi); } try { targetHost.setIP(InetAddress.getByAddress(dnsName, new byte[] { (byte) (new Integer(matcher.group(1)).intValue()), (byte) (new Integer(matcher.group(2)).intValue()), (byte) (new Integer(matcher.group(3)).intValue()), (byte) (new Integer(matcher.group(4)).intValue()) }), CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs curi.setFetchStatus(S_DNS_SUCCESS); } catch (UnknownHostException e) { logger.log(Level.SEVERE, 
"Should never be " + e.getMessage(), e); setUnresolvable(curi, targetHost); } return result; } /** * 封装到CrawlURI curi对象的Recorder httpRecorder属性 * @param curi * @param rrecordSet * @throws IOException */ protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet) throws IOException { //转换为byte[] final byte[] dnsRecord = getDNSRecord(curi.getFetchBeginTime(), rrecordSet); Recorder rec = curi.getRecorder(); // Shall we get a digest on the content downloaded? boolean digestContent = getDigestContent(); String algorithm = null; if (digestContent) { algorithm = getDigestAlgorithm(); rec.getRecordedInput().setDigest(algorithm); } else { rec.getRecordedInput().setDigest((MessageDigest)null); } //byte[]转换为InputStream,封装到CrawlURI curi对象的Recorder httpRecorder属性 InputStream is = curi.getRecorder().inputWrap( new ByteArrayInputStream(dnsRecord)); if (digestContent) { rec.getRecordedInput().startDigest(); } // Reading from the wrapped stream, behind the scenes, will write // files into scratch space try { while (is.read(this.reusableBuffer) != -1) { continue; } } finally { is.close(); rec.closeRecorders(); } curi.setContentSize(dnsRecord.length); if (digestContent) { curi.setContentDigest(algorithm, rec.getRecordedInput().getDigestValue()); } } /** * 转换为byte[] * @param fetchStart * @param rrecordSet * @return * @throws IOException */ protected byte [] getDNSRecord(final long fetchStart, final Record[] rrecordSet) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); // Start the record with a 14-digit date per RFC 2540 byte[] fetchDate = ArchiveUtils.get14DigitDate(fetchStart).getBytes(); baos.write(fetchDate); // Don't forget the newline baos.write("\n".getBytes()); int recordLength = fetchDate.length + 1; if (rrecordSet != null) { for (int i = 0; i < rrecordSet.length; i++) { byte[] record = rrecordSet[i].toString().getBytes(); recordLength += record.length; baos.write(record); // Add the newline between records back in 
baos.write("\n".getBytes()); recordLength += 1; } } return baos.toByteArray(); } protected void setUnresolvable(CrawlURI curi, CrawlHost host) { host.setIP(null, 0); curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE); } /** * 返回Record[] rrecordSet数组Type.A类型的Record元素 * @param rrecordSet * @return */ protected ARecord getFirstARecord(Record[] rrecordSet) { ARecord arecord = null; if (rrecordSet == null || rrecordSet.length == 0) { if (logger.isLoggable(Level.FINEST)) { logger.finest("rrecordSet is null or zero length: " + rrecordSet); } return arecord; } for (int i = 0; i < rrecordSet.length; i++) { if (rrecordSet[i].getType() != Type.A) { if (logger.isLoggable(Level.FINEST)) { logger.finest("Record " + Integer.toString(i) + " is not A type but " + rrecordSet[i].getType()); } continue; } arecord = (ARecord) rrecordSet[i]; break; } return arecord; }
FetchDNS处理器和后面的FetchHTTP处理器涉及到消息摘要算法MessageDigest digest 对象,我这里转自网上的一篇文章供参考
转自 http://huangyunbin.iteye.com/blog/1123442
MessageDigest的功能及用法
MessageDigest 类为应用程序提供信息摘要算法的功能,如 MD5 或 SHA 算法。信息摘要是安全的单向哈希函数,它接收任意大小的数据,并输出固定长度的哈希值。
MessageDigest 对象在创建之后即处于已初始化状态。该对象通过使用 update()方法处理数据。任何时候都可以调用 reset()方法重置摘要。一旦所有需要更新的数据都已经被更新了,应该调用digest() 方法之一完成哈希计算。
对于给定数量的更新数据,digest 方法只能被调用一次。在调用 digest 之后,MessageDigest 对象被重新设置成其初始状态。
1、public static MessageDigest getInstance(String algorithm)
throws NoSuchAlgorithmException
返回实现指定摘要算法的 MessageDigest 对象。
algorithm - 所请求算法的名称
2、public static MessageDigest getInstance(String algorithm,
String provider)
throws NoSuchAlgorithmException,
NoSuchProviderException
返回实现指定摘要算法的 MessageDigest 对象。
algorithm - 所请求算法的名称
provider - 提供者的名称。
3、public void update(byte[] input)
使用指定的 byte 数组更新摘要。
4、public byte[] digest()
通过执行诸如填充之类的最终操作完成哈希计算。在调用此方法之后,摘要被重置。
5、public static boolean isEqual(byte[] digesta,
byte[] digestb)
比较两个摘要的相等性。做简单的字节比较。
注意:可以通过 java.security.Security.getProviders() 方法获取已注册的提供者(Provider)列表,其中比较常用的是“SUN”。
SUN提供的常用的算法名称有:MD2
MD5
SHA-1
SHA-256
SHA-384
SHA-512
Code举例:
import java.nio.charset.StandardCharsets;
import java.security.*;

/**
 * Small demonstration of {@link MessageDigest}: computes a SHA-1 digest of a message,
 * recomputes it independently, and compares the two.
 */
public class myDigest {

    /** Upper-case hex digits, indexed by nibble value. */
    private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    public static void main(String[] args) {
        myDigest my = new myDigest();
        my.testDigest();
    }

    /**
     * Digests a sample message twice with SHA-1 and prints whether both digests agree.
     * Prints an error line instead if the algorithm is not available.
     */
    public void testDigest() {
        try {
            String myinfo = "我的测试信息";
            // Encode with an explicit charset: the platform default differs between
            // machines, which would make the digest of non-ASCII text differ too.
            byte[] payload = myinfo.getBytes(StandardCharsets.UTF_8);

            // Any algorithm name the provider supports works here, e.g. "MD5".
            MessageDigest alga = MessageDigest.getInstance("SHA-1");
            alga.update(payload);
            byte[] digesta = alga.digest();
            System.out.println("本信息摘要是:" + byte2hex(digesta));

            // Send the message (myinfo) and its digest (digesta) to someone by some
            // means; the receiver recomputes the digest to check for tampering or
            // transmission errors.
            MessageDigest algb = MessageDigest.getInstance("SHA-1");
            algb.update(payload);
            // isEqual is static, so call it on the class rather than an instance.
            if (MessageDigest.isEqual(digesta, algb.digest())) {
                System.out.println("信息检查正常");
            } else {
                System.out.println("摘要不相同");
            }
        } catch (NoSuchAlgorithmException ex) {
            System.out.println("非法摘要算法");
        }
    }

    /**
     * Converts binary data to an upper-case hex string with bytes separated by colons,
     * e.g. {@code {0x0f, 0xa0}} becomes {@code "0F:A0"}.
     *
     * @param b the bytes to format; must not be {@code null}
     * @return the formatted string, empty for an empty array
     */
    public String byte2hex(byte[] b) {
        StringBuilder hex = new StringBuilder(Math.max(0, b.length * 3 - 1));
        for (int n = 0; n < b.length; n++) {
            if (n > 0) {
                hex.append(':');
            }
            int value = b[n] & 0xFF;
            hex.append(HEX_DIGITS[value >>> 4]).append(HEX_DIGITS[value & 0x0F]);
        }
        return hex.toString();
    }
}
关于Java加密的更多信息:http://www.ibm.com/developerworks/cn/java/l-security/
--------------------------------------------------------------------------
本系列Heritrix 3.1.0 源码解析系本人原创
转载请注明出处 博客园 刺猬的温驯
本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/30/3052411.html