Heritrix 3.1.0 源码解析(三十二)

本文要分析的是FetchDNS处理器。该处理器的功能是对CrawlURI curi对象所引用的主机名进行DNS解析,解析工作由dnsjava-2.0.3.jar组件完成(读者也可以参考本文的代码,使用dnsjava-2.0.3.jar组件的API自行实现DNS解析)

FetchDNS处理器的重要成员变量

// Defaults.
    // DNS class handed to the dnsjava Lookup in innerProcess (DClass.IN).
    private short ClassType = DClass.IN;
    // DNS record type handed to the dnsjava Lookup in innerProcess (Type.A).
    private short TypeType = Type.A;
    // NOTE(review): not referenced by any code visible in this excerpt — confirm where it is used.
    protected InetAddress serverInetAddr = null;

 /**
     * Cache consulted to obtain the CrawlHost entry for a host name.
     */
    protected ServerCache serverCache;

    @Autowired
    public void setServerCache(ServerCache serverCache) {
        this.serverCache = serverCache;
    }

    public ServerCache getServerCache() {
        return serverCache;
    }
    
    /**
     * Whether to compute an on-the-fly digest hash of retrieved
     * content-bodies. The instance initializer below switches it on by
     * default.
     */
    {
        setDigestContent(true);
    }
    public void setDigestContent(boolean digest) {
        kp.put("digestContent", digest);
    }
    public boolean getDigestContent() {
        final Boolean enabled = (Boolean) kp.get("digestContent");
        return enabled;
    }

    /**
     * Name of the algorithm (for example MD5 or SHA-1) used for the
     * on-the-fly digest hash of retrieved content-bodies. Starts out as
     * "sha1".
     */
    String digestAlgorithm = "sha1";

    public void setDigestAlgorithm(String digestAlgorithm) {
        this.digestAlgorithm = digestAlgorithm;
    }

    public String getDigestAlgorithm() {
        return this.digestAlgorithm;
    }

处理器void innerProcess(CrawlURI curi)方法

/**
     * Resolves the host referenced by a CrawlURI and records the outcome on
     * the CrawlURI and on the CrawlHost obtained from the server cache.
     *
     * <p>Steps, in order: extract the referenced host (status
     * S_UNFETCHABLE_URI if none can be obtained); let isQuadAddress handle
     * dotted-quad hosts; otherwise run a dnsjava Lookup and store the answer
     * via storeDNSRecord. When the lookup yields nothing, fall back to
     * InetAddress.getByName if non-DNS resolves are accepted or the host is
     * "localhost"; in every other case mark the host unresolvable.
     *
     * @param curi the CrawlURI whose referenced host is to be resolved
     */
protected void innerProcess(CrawlURI curi) {
        String dnsName = null;
        try {
            dnsName = curi.getUURI().getReferencedHost();
        } catch (URIException e) {
            logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
        }
        if (dnsName == null) {
            // No host could be extracted: nothing to resolve.
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            return;
        }

        final CrawlHost targetHost = getServerCache().getHostFor(dnsName);
        // Dotted-quad hosts are converted straight to an InetAddress.
        if (isQuadAddress(curi, dnsName, targetHost)) {
            // We're done processing.
            return;
        }

        // Do actual DNS lookup.
        curi.setFetchBeginTime(System.currentTimeMillis());

        // Try to get the records for this host (assume domain name)
        // TODO: Bug #935119 concerns potential hang here
        String lookupName = dnsName;
        if (!lookupName.endsWith(".")) {
            lookupName = lookupName + ".";
        }

        Record[] answers;
        try {
            // The DNS query itself.
            answers = new Lookup(lookupName, TypeType, ClassType).run();
        } catch (TextParseException e) {
            answers = null;
        }
        curi.setContentType("text/dns");

        if (answers != null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Found recordset for " + lookupName);
            }
            // Sets the IP on targetHost and records the answer on curi.
            storeDNSRecord(curi, dnsName, targetHost, answers);
        } else {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Failed find of recordset for " + lookupName);
            }
            if (!getAcceptNonDnsResolves() && !"localhost".equals(dnsName)) {
                setUnresolvable(curi, targetHost);
            } else {
                // Do lookup that bypasses javadns.
                InetAddress address;
                try {
                    address = InetAddress.getByName(dnsName);
                } catch (UnknownHostException e1) {
                    address = null;
                }
                if (address == null) {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Failed find of address for " + dnsName +
                            " using native dns.");
                    }
                    setUnresolvable(curi, targetHost);
                } else {
                    targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
                    curi.setFetchStatus(S_GETBYNAME_SUCCESS);
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Found address for " + dnsName +
                            " using native dns.");
                    }
                }
            }
        }
        curi.setFetchCompletedTime(System.currentTimeMillis());
    }

相关调用方法如下(dnsjava-2.0.3.jar组件的API) 

/**
     * Stores a DNS answer: sets the IP and TTL of the target host from the
     * first A record, then records the answer on the CrawlURI.
     *
     * <p>On success the fetch status becomes S_DNS_SUCCESS and the DNS
     * server label is taken from the current ResolverConfig. If recording
     * fails with an IOException the host is marked unresolvable.
     *
     * @param curi the CrawlURI being processed
     * @param dnsName host name that was looked up; used in the error message
     * @param targetHost host entry that receives the resolved IP
     * @param rrecordSet records returned by the lookup
     * @throws NullPointerException if the record set contains no A record
     */
    protected void storeDNSRecord(final CrawlURI curi, final String dnsName,
            final CrawlHost targetHost, final Record[] rrecordSet) {
        // Get TTL and IP info from the first A record (there may be
        // multiple, e.g. www.washington.edu) then update the CrawlServer
        final ARecord firstA = getFirstARecord(rrecordSet);
        if (firstA == null) {
            final String message = "Got null arecord for " + dnsName;
            throw new NullPointerException(message);
        }

        // Publish the resolved address on the host entry.
        targetHost.setIP(firstA.getAddress(), firstA.getTTL());

        try {
            // Write the answer through the CrawlURI's recorder.
            recordDNS(curi, rrecordSet);
            curi.setFetchStatus(S_DNS_SUCCESS);
            curi.setDNSServerIPLabel(
                ResolverConfig.getCurrentConfig().server());
        } catch (IOException e) {
            logger.log(Level.SEVERE,
                "Failed store of DNS Record for " + curi, e);
            setUnresolvable(curi, targetHost);
        }
    }
    /**
     * Short-circuits DNS resolution when the host name is already a
     * dotted-quad IPv4 literal, converting it directly to an InetAddress.
     *
     * <p>When the name matches InetAddressUtil.IPV4_QUADS and every octet is
     * within 0-255, the address is stored on the target host with
     * CrawlHost.IP_NEVER_EXPIRES and the fetch status is set to
     * S_DNS_SUCCESS. A name with an octet outside 0-255 is not a valid IPv4
     * literal; it is reported as "not a quad address" so the caller performs
     * a regular lookup instead of storing a silently truncated address.
     *
     * @param curi the CrawlURI being processed
     * @param dnsName host name to test
     * @param targetHost host entry that receives the address
     * @return true if the name was handled here as a numeric IPv4 address
     *         (even if storing it failed and the host was marked
     *         unresolvable); false if the caller must resolve it via DNS
     */
    protected boolean isQuadAddress(final CrawlURI curi, final String dnsName,
            final CrawlHost targetHost) {
        final Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
        // If it's not an ip the caller has to do a lookup
        if (!matcher.matches()) {
            return false;
        }

        // Parse the four captured groups. A plain (byte) cast would wrap
        // values above 255 (300 -> 44), so range-check each octet first.
        // NOTE(review): whether IPV4_QUADS itself already limits octets to
        // 0-255 is not visible here; this check is harmless if it does.
        final byte[] octets = new byte[4];
        for (int i = 0; i < octets.length; i++) {
            final int value = Integer.parseInt(matcher.group(i + 1));
            if (value < 0 || value > 255) {
                return false;
            }
            octets[i] = (byte) value;
        }

        // Ideally this branch would never be reached: no CrawlURI
        // would be created for numerical IPs
        if (logger.isLoggable(Level.WARNING)) {
            logger.warning("Unnecessary DNS CrawlURI created: " + curi);
        }
        try {
            targetHost.setIP(InetAddress.getByAddress(dnsName, octets),
                    CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs
            curi.setFetchStatus(S_DNS_SUCCESS);
        } catch (UnknownHostException e) {
            logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
            setUnresolvable(curi, targetHost);
        }
        return true;
    }
    /**
     * Writes the DNS answer through the CrawlURI's Recorder and stores the
     * resulting content size and, if enabled, the content digest.
     *
     * @param curi the CrawlURI whose recorder receives the data
     * @param rrecordSet records returned by the lookup
     * @throws IOException if serializing or recording the answer fails
     */
    protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet)
            throws IOException {
        // Serialize the answer to bytes.
        final byte[] payload =
            getDNSRecord(curi.getFetchBeginTime(), rrecordSet);

        final Recorder recorder = curi.getRecorder();
        // Shall we get a digest on the content downloaded?
        final boolean digestContent = getDigestContent();
        final String algorithm = digestContent ? getDigestAlgorithm() : null;
        if (digestContent) {
            recorder.getRecordedInput().setDigest(algorithm);
        } else {
            recorder.getRecordedInput().setDigest((MessageDigest) null);
        }

        // Wrap the bytes as a stream owned by the CrawlURI's recorder.
        final InputStream wrapped = curi.getRecorder().inputWrap(
                new ByteArrayInputStream(payload));

        if (digestContent) {
            recorder.getRecordedInput().startDigest();
        }

        // Reading from the wrapped stream, behind the scenes, will write
        // files into scratch space
        try {
            int read;
            do {
                read = wrapped.read(this.reusableBuffer);
            } while (read != -1);
        } finally {
            wrapped.close();
            recorder.closeRecorders();
        }
        curi.setContentSize(payload.length);

        if (digestContent) {
            curi.setContentDigest(algorithm,
                recorder.getRecordedInput().getDigestValue());
        }
    }
    /**
     * Serializes a DNS answer into the byte form that is recorded as the
     * fetched content.
     *
     * <p>Layout: the 14-digit fetch date, a newline, then the toString()
     * form of each record, each followed by a newline. Text is encoded as
     * UTF-8 rather than the platform default charset, so the bytes (and any
     * digest computed over them) do not vary between machines.
     *
     * @param fetchStart fetch begin time, formatted with
     *        ArchiveUtils.get14DigitDate
     * @param rrecordSet records to serialize; may be null, in which case
     *        only the date line is produced
     * @return the serialized bytes
     * @throws IOException if writing to the buffer fails or UTF-8 is
     *         unavailable
     */
    protected byte [] getDNSRecord(final long fetchStart,
            final Record[] rrecordSet)
    throws IOException {
        final String charset = "UTF-8";
        final byte[] newline = "\n".getBytes(charset);
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Start the record with a 14-digit date per RFC 2540
        baos.write(ArchiveUtils.get14DigitDate(fetchStart).getBytes(charset));
        // Don't forget the newline
        baos.write(newline);
        if (rrecordSet != null) {
            for (int i = 0; i < rrecordSet.length; i++) {
                baos.write(rrecordSet[i].toString().getBytes(charset));
                // Add the newline between records back in
                baos.write(newline);
            }
        }
        return baos.toByteArray();
    }
    
    /**
     * Marks a lookup as failed: clears the host's IP (null address with a
     * TTL of 0) and sets the CrawlURI's fetch status to
     * S_DOMAIN_UNRESOLVABLE.
     *
     * @param curi the CrawlURI whose fetch status is updated
     * @param host the host entry whose IP is cleared
     */
    protected void setUnresolvable(CrawlURI curi, CrawlHost host) {
        host.setIP(null, 0);
        curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE); 
    }
    /**
     * Finds the first record of type A in a DNS answer.
     *
     * @param rrecordSet records returned by a lookup; may be null or empty
     * @return the first element whose type is Type.A, cast to ARecord, or
     *         null when the array is null, empty, or contains no A record
     */
    protected ARecord getFirstARecord(Record[] rrecordSet) {
        if (rrecordSet == null || rrecordSet.length == 0) {
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("rrecordSet is null or zero length: " +
                    rrecordSet);
            }
            return null;
        }
        for (int index = 0; index < rrecordSet.length; index++) {
            final Record candidate = rrecordSet[index];
            if (candidate.getType() == Type.A) {
                return (ARecord) candidate;
            }
            // Not an A record: note it at FINEST and keep scanning.
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("Record " + index +
                    " is not A type but " + candidate.getType());
            }
        }
        return null;
    }

FetchDNS处理器和后面的FetchHTTP处理器涉及到消息摘要算法MessageDigest digest 对象,我这里转自网上的一篇文章供参考 

转自 http://huangyunbin.iteye.com/blog/1123442

MessageDigest的功能及用法

MessageDigest 类为应用程序提供信息摘要算法的功能,如 MD5 或 SHA 算法。信息摘要是安全的单向哈希函数,它接收任意大小的数据,并输出固定长度的哈希值。 

MessageDigest 对象在创建时即处于初始化状态。随后通过调用 update() 方法向该对象输入待处理的数据。任何时候都可以调用 reset() 方法重置摘要。一旦所有需要处理的数据都已通过 update() 输入,就应调用某个 digest() 方法完成哈希计算。 

对于给定数量的更新数据,digest 方法只能被调用一次。在调用 digest 之后,MessageDigest 对象被重新设置成其初始状态。 

1、public static MessageDigest getInstance(String algorithm) 
                                 throws NoSuchAlgorithmException 

   返回实现指定摘要算法的 MessageDigest 对象。 

   algorithm - 所请求算法的名称 

2、public static MessageDigest getInstance(String algorithm, 
                                        String provider) 
                                 throws NoSuchAlgorithmException, 
                                        NoSuchProviderException 

  返回实现指定摘要算法的 MessageDigest 对象。 

  algorithm - 所请求算法的名称 

  provider - 提供者的名称。 

3、public void update(byte[] input) 

  使用指定的 byte 数组更新摘要。 

4、public byte[] digest() 

  通过执行诸如填充之类的最终操作完成哈希计算。在调用此方法之后,摘要被重置。 

5、public static boolean isEqual(byte[] digesta, 
                              byte[] digestb) 

    比较两个摘要的相等性。做简单的字节比较。 


注意:可以通过 java.security.Security.getProviders() 方法获取已注册的提供者(Provider)列表,比较常用的提供者是“SUN”。

SUN提供的常用算法名称有:MD2、MD5、SHA-1、SHA-256、SHA-384、SHA-512。

Code举例: 

import java.security.*; 
/**
 * Small demonstration of java.security.MessageDigest: computes the SHA-1
 * digest of a message, recomputes it the way a receiver would, and compares
 * the two digests.
 */
public class myDigest {
  /**
   * Charset used to turn the message into bytes. It is fixed so that sender
   * and receiver hash the same bytes regardless of their platform default.
   */
  private static final java.nio.charset.Charset UTF8 =
      java.nio.charset.Charset.forName("UTF-8");

  public static void main(String[] args) {
    myDigest my = new myDigest();
    my.testDigest();
  }

  /**
   * Hashes a sample message with SHA-1, prints the digest in hex, then
   * hashes the same message again and prints whether both digests match.
   */
  public void testDigest() {
    try {
      String myinfo = "我的测试信息";
      byte[] payload = myinfo.getBytes(UTF8);
      // "MD5" could be passed to getInstance instead of "SHA-1".
      java.security.MessageDigest alga =
          java.security.MessageDigest.getInstance("SHA-1");
      alga.update(payload);
      byte[] digesta = alga.digest();
      System.out.println("本信息摘要是:" + byte2hex(digesta));
      // The message (myinfo) and its digest (digesta) would be sent to the
      // other party, who recomputes the digest to detect modification or
      // transmission errors.
      java.security.MessageDigest algb =
          java.security.MessageDigest.getInstance("SHA-1");
      algb.update(payload);
      // isEqual is static, so call it on the class rather than an instance.
      if (java.security.MessageDigest.isEqual(digesta, algb.digest())) {
        System.out.println("信息检查正常");
      } else {
        System.out.println("摘要不相同");
      }
    } catch (java.security.NoSuchAlgorithmException ex) {
      System.out.println("非法摘要算法");
    }
  }

  /**
   * Converts binary data to an upper-case hex string with the bytes
   * separated by colons, e.g. {0x0A, 0xFF} becomes "0A:FF".
   *
   * @param b bytes to format; must not be null
   * @return the formatted string, empty for an empty array
   */
  public String byte2hex(byte[] b) {
    // Each byte yields two hex digits plus a separator.
    StringBuilder hex = new StringBuilder(b.length * 3);
    for (int n = 0; n < b.length; n++) {
      String octet = Integer.toHexString(b[n] & 0xFF);
      if (octet.length() == 1) {
        hex.append('0');
      }
      hex.append(octet);
      if (n < b.length - 1) {
        hex.append(':');
      }
    }
    // Locale.ROOT keeps the case conversion independent of the default locale.
    return hex.toString().toUpperCase(java.util.Locale.ROOT);
  }
}

关于Java加密的更多信息:http://www.ibm.com/developerworks/cn/java/l-security/

--------------------------------------------------------------------------

本系列Heritrix 3.1.0 源码解析系本人原创

转载请注明出处 博客园 刺猬的温驯

本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/30/3052411.html

原文地址:https://www.cnblogs.com/chenying99/p/3052411.html