Programming a Spider in Java 源码帖

Programming a Spider in Java 源码帖
Listing 1: Finding the bad links (CheckLinks.java)
import java.awt.*;
import javax.swing.*;
import java.net.*;
import java.io.*;
/**
* This example uses a Java spider to scan a Web site
* and check for broken links. Written by Jeff Heaton.
* Jeff Heaton is the author of "Programming Spiders,
* Bots, and Aggregators" by Sybex. Jeff can be contacted
* through his Web site at http://www.jeffheaton.com.
* 
* @author Jeff Heaton(http://www.jeffheaton.com)
* @version 1.0
*/
public class CheckLinks extends javax.swing.JFrame implements Runnable,ISpiderReportable {
   /**
    * The constructor. Builds the controls using absolute positioning
    * (null layout) and registers the button listener. The frame is
    * left hidden; the caller decides when to show it.
    */
   public CheckLinks() {
     //{{INIT_CONTROLS
     setTitle("Find Broken Links");
     getContentPane().setLayout(null);
     setSize(405,288);
     setVisible(false);
     label1.setText("Enter a URL:");
     getContentPane().add(label1);
     label1.setBounds(12,12,84,12);
     begin.setText("Begin");
     begin.setActionCommand("Begin");
     getContentPane().add(begin);
     begin.setBounds(12,36,84,24);
     getContentPane().add(url);
     url.setBounds(108,36,288,24);
     errorScroll.setAutoscrolls(true);
     errorScroll.setHorizontalScrollBarPolicy(javax.swing.ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS);
     errorScroll.setVerticalScrollBarPolicy(javax.swing.ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);
     errorScroll.setOpaque(true);
     getContentPane().add(errorScroll);
     errorScroll.setBounds(12,120,384,156);
     errors.setEditable(false);
     errorScroll.getViewport().add(errors);
     errors.setBounds(0,0,366,138);
     current.setText("Currently Processing: ");
     getContentPane().add(current);
     current.setBounds(12,72,384,12);
     goodLinksLabel.setText("Good Links: 0");
     getContentPane().add(goodLinksLabel);
     goodLinksLabel.setBounds(12,96,192,12);
     badLinksLabel.setText("Bad Links: 0");
     getContentPane().add(badLinksLabel);
     badLinksLabel.setBounds(216,96,96,12);
     //}}
     //{{INIT_MENUS
     //}}
     //{{REGISTER_LISTENERS
     SymAction lSymAction = new SymAction();
     begin.addActionListener(lSymAction);
     //}}
   }
   /**
    * Main method for the application. Creates and shows the frame on
    * the event dispatch thread.
    * 
    * @param args Not used
    */
   static public void main(String args[]){
     SwingUtilities.invokeLater(new Runnable(){
       public void run(){
         CheckLinks frame = new CheckLinks();
         // A JFrame only hides itself on close by default, which
         // would leave the JVM running with no window; exit instead.
         frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
         frame.setVisible(true);
       }
     });
   }
   /**
    * Add notifications. The first time the frame is realized, its
    * size is grown by the insets and menu bar height so that the
    * content area keeps the size requested in the constructor.
    */
   public void addNotify(){
     // Record the size of the window prior to calling parent's
     // addNotify.
     Dimension size = getSize();
     super.addNotify();
     if ( frameSizeAdjusted ) {
       return;
     }
     frameSizeAdjusted = true;
     // Adjust size of frame according to the insets and menu bar
     Insets insets = getInsets();
     javax.swing.JMenuBar menuBar = getRootPane().getJMenuBar();
     int menuBarHeight = 0;
     if ( menuBar != null ) {
       menuBarHeight = menuBar.getPreferredSize().height;
     }
     setSize(insets.left + insets.right + size.width, insets.top + insets.bottom + size.height + menuBarHeight);
   }
   // Used by addNotify
   boolean frameSizeAdjusted = false;
   //{{DECLARE_CONTROLS
   javax.swing.JLabel label1 = new javax.swing.JLabel();
   /**
    * The begin or cancel button
    */
   javax.swing.JButton begin = new javax.swing.JButton();
   /**
    * The URL being processed
    */
   javax.swing.JTextField url = new javax.swing.JTextField();
   /**
    * Scroll the errors.
    */
   javax.swing.JScrollPane errorScroll = new javax.swing.JScrollPane();
   /**
    * A place to store the errors created
    */
   javax.swing.JTextArea errors = new javax.swing.JTextArea();
   javax.swing.JLabel current = new javax.swing.JLabel();
   javax.swing.JLabel goodLinksLabel = new javax.swing.JLabel();
   javax.swing.JLabel badLinksLabel = new javax.swing.JLabel();
   //}}
   //{{DECLARE_MENUS
   //}}
   /**
    * The background spider thread. Non-null while a scan is running.
    * Only read and written on the event dispatch thread.
    */
   protected Thread backgroundThread;
   /**
    * The spider object being used. Assigned by the background thread
    * and read by the event dispatch thread (to cancel), hence volatile.
    */
   protected volatile Spider spider;
   /**
    * The URL that the spider began with
    */
   protected URL base;
   /**
    * How many bad links have been found. Reset on the event dispatch
    * thread before the background thread starts, then only touched by
    * the background thread.
    */
   protected int badLinksCount = 0;
   /**
    * How many good links have been found. Same threading rules as
    * badLinksCount.
    */
   protected int goodLinksCount = 0;

   /**
    * Internal class used to dispatch events
    * 
    * @author Jeff Heaton
    * @version 1.0
    */
   class SymAction implements java.awt.event.ActionListener {
     public void actionPerformed(java.awt.event.ActionEvent event){
       Object object = event.getSource();
       if ( object == begin ) {
         begin_actionPerformed(event);
       }
     }
   }
   /**
    * Called when the begin or cancel buttons are clicked. Starts a
    * new scan when none is running, otherwise asks the running spider
    * to stop.
    * 
    * @param event The event associated with the button.
    */
   void begin_actionPerformed(java.awt.event.ActionEvent event){
     if ( backgroundThread==null ) {
       begin.setText("Cancel");
       // Reset all per-scan state here, on the event dispatch thread
       // and BEFORE the worker starts, so the worker never races
       // with the reset.
       errors.setText("");
       goodLinksCount=0;
       badLinksCount=0;
       backgroundThread = new Thread(this);
       backgroundThread.start();
     } else {
       // The worker may not have created its spider yet.
       Spider running = spider;
       if ( running!=null ) {
         running.cancel();
       }
     }
   }
   /**
    * Perform the background thread operation: spider the site
    * starting at the URL typed by the user. Whatever the outcome
    * (finished, canceled, bad address), the button is restored and
    * the frame becomes ready for another scan.
    */
   public void run(){
     try {
       spider = new Spider(this);
       spider.clear();
       base = new URL(url.getText());
       spider.addURL(base);
       spider.begin();
     } catch ( MalformedURLException e ) {
       UpdateErrors err = new UpdateErrors();
       err.msg = "Bad address.";
       SwingUtilities.invokeLater(err);
     } finally {
       // Must also run after a bad address; otherwise the button
       // stays on "Cancel" and no new scan can ever be started.
       Runnable doLater = new Runnable(){
         public void run(){
           begin.setText("Begin");
           backgroundThread = null;
         }
       };
       SwingUtilities.invokeLater(doLater);
     }
   }
   /**
    * Called by the spider when a URL is found. It is here
    * that links are validated.
    * 
    * @param base The page that the link was found on.
    * @param url The actual link address.
    * @return True if the link is good and on the same host as the
    * page it was found on; false otherwise.
    */
   public boolean spiderFoundURL(URL base,URL url){
     // Show which link is being checked before the (slow) check.
     postStats(url.toString());
     boolean good = checkLink(url);
     if ( good ) {
       goodLinksCount++;
     } else {
       UpdateErrors err = new UpdateErrors();
       err.msg = url+"(on page " + base + ")\n";
       SwingUtilities.invokeLater(err);
       badLinksCount++;
     }
     // Post again so the labels include this link's result; without
     // this the counters lag one link behind and the last link is
     // never counted on screen.
     postStats(url.toString());
     return good && url.getHost().equalsIgnoreCase(base.getHost());
   }
   /**
    * Queue a refresh of the status labels on the event dispatch
    * thread, using the counter values at the time of the call.
    * 
    * @param msg The text to show after "Currently Processing: ".
    */
   private void postStats(String msg){
     UpdateCurrentStats cs = new UpdateCurrentStats();
     cs.msg = msg;
     SwingUtilities.invokeLater(cs);
   }
   /**
    * Called when a URL error is found
    * 
    * @param url The URL that resulted in an error.
    */
   public void spiderURLError(URL url){
   }
   /**
    * Called internally to check whether a link is good. For HTTP(S)
    * links the response status is examined, because merely
    * connecting succeeds even when the server answers 404 or 500.
    * 
    * @param url The link that is being checked.
    * @return True if the link was good, false otherwise.
    */
   protected boolean checkLink(URL url){
     URLConnection connection = null;
     try {
       connection = url.openConnection();
       connection.connect();
       if ( connection instanceof HttpURLConnection ) {
         int status = ((HttpURLConnection)connection).getResponseCode();
         // -1 means the reply was not valid HTTP.
         return (status != -1) && (status < HttpURLConnection.HTTP_BAD_REQUEST);
       }
       return true;
     } catch ( IOException e ) {
       return false;
     } finally {
       if ( connection instanceof HttpURLConnection ) {
         ((HttpURLConnection)connection).disconnect();
       }
     }
   }
   /**
    * Called when the spider finds an e-mail address
    * 
    * @param email The email address the spider found.
    */
   public void spiderFoundEMail(String email){
   }
   /**
    * Internal class used to update the error information
    * in a Thread-Safe way (run it through invokeLater).
    * 
    * @author Jeff Heaton
    * @version 1.0
    */
   class UpdateErrors implements Runnable {
     public String msg;
     public void run(){
       errors.append(msg);
     }
   }
   /**
    * Used to update the current status information
    * in a "Thread-Safe" way (run it through invokeLater). The
    * counters are captured when the object is created, so the event
    * dispatch thread never reads fields the spider thread is
    * modifying.
    * 
    * @author Jeff Heaton
    * @version 1.0
    */
   class UpdateCurrentStats implements Runnable {
     public String msg;
     final int goodSnapshot = goodLinksCount;
     final int badSnapshot = badLinksCount;
     public void run(){
       current.setText("Currently Processing: " + msg );
       goodLinksLabel.setText("Good Links: " + goodSnapshot);
       badLinksLabel.setText("Bad Links: " + badSnapshot);
     }
   }
}
Listing 2: Reporting spider events (ISpiderReportable.java)
import java.net.*;
/**
 * Callback interface through which a spider reports what it finds.
 */
interface ISpiderReportable {
   /**
    * Reports a link found on a page.
    *
    * @param base The page that the link was found on.
    * @param url The link address.
    * @return A yes/no answer from the implementor about this link.
    */
   boolean spiderFoundURL(URL base, URL url);

   /**
    * Reports a URL that resulted in an error.
    *
    * @param url The URL that failed.
    */
   void spiderURLError(URL url);

   /**
    * Reports an e-mail address found on a page.
    *
    * @param email The e-mail address that was found.
    */
   void spiderFoundEMail(String email);
}
Listing 3: A reusable spider (Spider.java)
import java.util.*;
import java.net.*;
import java.io.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
* This class implements a reusable spider
* 
* @author Jeff Heaton(http://www.jeffheaton.com)
* @version 1.0
*/
public class Spider {
   /**
    * A collection of URLs that resulted in an error
    */
   protected Collection workloadError = new ArrayList(3);
   /**
    * A collection of URLs that are waiting to be processed
    */
   protected Collection workloadWaiting = new ArrayList(3);
   /**
    * A collection of URLs that were processed
    */
   protected Collection workloadProcessed = new ArrayList(3);
   /**
    * The class that the spider should report its URLs to
    */
   protected ISpiderReportable report;
   /**
    * A flag that indicates whether this process
    * should be canceled. Volatile because cancel() is meant to be
    * called from a different thread than the one running begin();
    * without it the running loop may never observe the request.
    */
   protected volatile boolean cancel = false;
   /**
    * The constructor
    * 
    * @param report A class that implements the ISpiderReportable
    * interface, that will receive information that the
    * spider finds.
    */
   public Spider(ISpiderReportable report){
     this.report = report;
   }
   /**
    * Get the URLs that resulted in an error.
    * 
    * @return A collection of URL's.
    */
   public Collection getWorkloadError(){
     return workloadError;
   }
   /**
    * Get the URLs that were waiting to be processed.
    * You should add one URL to this collection to
    * begin the spider.
    * 
    * @return A collection of URLs.
    */
   public Collection getWorkloadWaiting(){
     return workloadWaiting;
   }
   /**
    * Get the URLs that were processed by this spider.
    * 
    * @return A collection of URLs.
    */
   public Collection getWorkloadProcessed(){
     return workloadProcessed;
   }
   /**
    * Clear all of the workloads.
    */
   public void clear(){
     getWorkloadError().clear();
     getWorkloadWaiting().clear();
     getWorkloadProcessed().clear();
   }
   /**
    * Set a flag that will cause the begin
    * method to return before it is done.
    */
   public void cancel(){
     cancel = true;
   }
   /**
    * Add a URL for processing. URLs already present in any of the
    * three workloads are ignored.
    * 
    * @param url The URL to queue.
    */
   public void addURL(URL url){
     if ( getWorkloadWaiting().contains(url) ) {
       return;
     }
     if ( getWorkloadError().contains(url) ) {
       return;
     }
     if ( getWorkloadProcessed().contains(url) ) {
       return;
     }
     log("Adding to workload: " + url );
     getWorkloadWaiting().add(url);
   }
   /**
    * Called internally to process a URL. Text content is downloaded
    * and parsed for links; anything else is marked processed without
    * being read. On an I/O failure the URL is moved to the error
    * workload and reported through spiderURLError.
    * 
    * @param url The URL to be processed.
    */
   public void processURL(URL url){
     try {
       log("Processing: " + url );
       // get the URL's contents
       URLConnection connection = url.openConnection();
       String contentType = connection.getContentType();
       // Case-insensitive, locale-independent prefix test.
       if ( (contentType!=null) && !contentType.regionMatches(true,0,"text/",0,5) ) {
         // The body is never read on this path; release the
         // connection instead of leaving it open.
         if ( connection instanceof HttpURLConnection ) {
           ((HttpURLConnection)connection).disconnect();
         }
         getWorkloadWaiting().remove(url);
         getWorkloadProcessed().add(url);
         log("Not processing because content type is: " + contentType );
         return;
       }

       // read the URL
       InputStream is = connection.getInputStream();
       try {
         Reader r = new InputStreamReader(is);
         // parse the URL
         HTMLEditorKit.Parser parse = new HTMLParse().getParser();
         parse.parse(r,new Parser(url),true);
       } finally {
         // Always release the stream. A failure while closing must
         // not turn a successfully parsed page into an error.
         try {
           is.close();
         } catch ( IOException ignored ) {
           // nothing useful can be done here
         }
       }
     } catch ( IOException e ) {
       getWorkloadWaiting().remove(url);
       getWorkloadError().add(url);
       log("Error: " + url );
       report.spiderURLError(url);
       return;
     }
     // mark URL as complete
     getWorkloadWaiting().remove(url);
     getWorkloadProcessed().add(url);
     log("Complete: " + url );
   }
   /**
    * Called to start the spider. Runs until the waiting workload is
    * empty or cancel() is called. Any earlier cancel request is
    * cleared on entry.
    */
   public void begin(){
     cancel = false;
     while ( !getWorkloadWaiting().isEmpty() && !cancel ) {
       // Work on a snapshot: processURL modifies the waiting
       // workload while we iterate.
       Object list[] = getWorkloadWaiting().toArray();
       for ( int i=0;(i<list.length)&&!cancel;i++ ) {
         processURL((URL)list[i]);
       }
     }
   }
   /**
    * A HTML parser callback used by this class to detect links
    * 
    * @author Jeff Heaton
    * @version 1.0
    */
   protected class Parser
   extends HTMLEditorKit.ParserCallback {
     /** The page being parsed; relative links resolve against it. */
     protected URL base;
     public Parser(URL base){
       this.base = base;
     }
     /**
      * Extract a link from the tag's HREF attribute (or SRC for a
      * FRAME tag), strip any #fragment, and report it either as an
      * e-mail address or as a link.
      */
     public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a,int pos){
       String href = (String)a.getAttribute(HTML.Attribute.HREF);

       if ( (href==null) && (t==HTML.Tag.FRAME) ) {
         href = (String)a.getAttribute(HTML.Attribute.SRC);
       }

       if ( href==null ) {
         return;
       }
       int i = href.indexOf('#');
       if ( i!=-1 ) {
         href = href.substring(0,i);
       }
       // Match the full "mailto:" scheme. A bare "mailt" prefix also
       // swallows ordinary relative links such as "mailtest.html".
       // regionMatches is case-insensitive without depending on the
       // default locale.
       if ( href.regionMatches(true,0,"mailto:",0,7) ) {
         report.spiderFoundEMail(href);
         return;
       }
       handleLink(base,href);
     }
     public void handleStartTag(HTML.Tag t, MutableAttributeSet a,int pos){
       handleSimpleTag(t,a,pos);     // handle the same way
     }
     /**
      * Resolve a link against its page, report it, and queue it
      * when the report object returns true.
      * 
      * @param base The page the link was found on.
      * @param str The link text as it appeared in the page.
      */
     protected void handleLink(URL base,String str){
       try {
         URL url = new URL(base,str);
         if ( report.spiderFoundURL(base,url) ) {
           addURL(url);
         }
       } catch ( MalformedURLException e ) {
         log("Found malformed URL: " + str );
       }
     }
   }
   /**
    * Called internally to log information
    * This basic method just writes the log
    * out to the stdout.
    * 
    * @param entry The information to be written to the log.
    */
   public void log(String entry){
     System.out.println( (new Date()) + ":" + entry );
   }
}
Listing 4: Parsing HTML (HTMLParse.java)
import javax.swing.text.html.*;
/**
 * Thin HTMLEditorKit subclass whose only purpose is to widen the
 * visibility of getParser() to public, so code outside the Swing
 * text packages can obtain the HTML parser.
 */
public class HTMLParse extends HTMLEditorKit {
   /**
    * Returns the parser supplied by HTMLEditorKit.
    *
    * @return The parser obtained from the superclass.
    */
   public HTMLEditorKit.Parser getParser(){
     HTMLEditorKit.Parser inherited = super.getParser();
     return inherited;
   }
}
posted @ 2014-02-10 17:39  cRaZy_TyKeIo  阅读(218)  评论(0编辑  收藏  举报