
Source Code of org.apache.lenya.search.crawler.IterativeHTMLCrawler

/*
* $Id: IterativeHTMLCrawler.java,v 1.15 2003/04/24 13:53:14 gregor Exp $
* <License>
* The Apache Software License
*
* Copyright (c) 2002 lenya. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
*    list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice, this
*    list of conditions and the following disclaimer in the documentation and/or
*    other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
*    display the following acknowledgment: "This product includes software developed
*    by lenya (http://www.lenya.org)"
*
* 4. The name "lenya" must not be used to endorse or promote products derived from
*    this software without prior written permission. For written permission, please
*    contact contact@lenya.org
*
* 5. Products derived from this software may not be called "lenya" nor may "lenya"
*    appear in their names without prior written permission of lenya.
*
* 6. Redistributions of any form whatsoever must retain the following acknowledgment:
*    "This product includes software developed by lenya (http://www.lenya.org)"
*
* THIS SOFTWARE IS PROVIDED BY lenya "AS IS" WITHOUT ANY WARRANTY EXPRESS OR IMPLIED,
* INCLUDING THE WARRANTY OF NON-INFRINGEMENT AND THE IMPLIED WARRANTIES OF MERCHANTI-
* BILITY AND FITNESS FOR A PARTICULAR PURPOSE. lenya WILL NOT BE LIABLE FOR ANY DAMAGES
* SUFFERED BY YOU AS A RESULT OF USING THIS SOFTWARE. IN NO EVENT WILL lenya BE LIABLE
* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR LOST PROFITS EVEN IF lenya HAS
* BEEN ADVISED OF THE POSSIBILITY OF THEIR OCCURRENCE. lenya WILL NOT BE LIABLE FOR ANY
* THIRD PARTY CLAIMS AGAINST YOU.
*
* Lenya includes software developed by the Apache Software Foundation, W3C,
* DOM4J Project, BitfluxEditor and Xopus.
* </License>
*/
package org.apache.lenya.search.crawler;

import websphinx.RobotExclusion;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;

import java.io.InputStreamReader;
import java.io.BufferedInputStream;
import java.io.BufferedReader;

import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

import java.util.StringTokenizer;


/**
 * Iterative (breadth-first) HTML crawler. Starting from a seed URL, it
 * follows links within a configured scope, honours robots.txt exclusions,
 * and dumps the retrieved documents to the local file system.
 *
 * @author $author$
 * @version $Revision: 1.15 $
 */
public class IterativeHTMLCrawler {
    java.util.Vector urlsToCrawl; // crawl queue; doubles as the list of visited URLs
    java.util.TreeSet urlsToCrawlLowerCase; // lower-case URLs, used for duplicate detection
    String url_list_file = "url_file.txt";
    String html_dump_directory = "html_dump";
    private String rootURL;

    private String[] scopeURL;
    private RobotExclusion robot;

    /**
     * Creates a new IterativeHTMLCrawler object.
     *
     * @param url_list_file file to which the list of crawled URLs is written
     * @param html_dump_directory directory to which the retrieved documents are dumped
     * @param userAgent user agent to match against robots.txt exclusion rules
     */
    public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) {
        this.url_list_file = url_list_file;
        this.html_dump_directory = html_dump_directory;

        robot = new RobotExclusion(userAgent);
    }

    /**
     * Command line entry point.
     *
     * @param args the single expected argument is the path to the crawler
     *        configuration file (crawler.xconf)
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");

            return;
        }

        try {
            CrawlerConfiguration ce = new CrawlerConfiguration(args[0]);
            new IterativeHTMLCrawler(ce.resolvePath(ce.getURIList()),
                ce.resolvePath(ce.getHTDocsDumpDir()), ce.getUserAgent()).crawl(new URL(
                    ce.getBaseURL()), ce.getScopeURL());
        } catch (MalformedURLException e) {
            System.err.println(e);
        }
    }

    /**
     * Crawls all pages reachable from the start URL that lie within the given scope.
     *
     * @param start seed URL at which crawling begins
     * @param scope URL prefix that limits which links are followed
     */
    public void crawl(URL start, String scope) {
        scopeURL = new String[1];
        scopeURL[0] = scope;

        String seedURL = start.toString();

        // Keep only scheme and authority, e.g. "http://www.lenya.org".
        this.rootURL = seedURL.substring(0, seedURL.indexOf("/", 8));

        urlsToCrawl = new java.util.Vector();
        urlsToCrawlLowerCase = new java.util.TreeSet();

        String currentURLPath = start.toString().substring(0, start.toString().lastIndexOf("/"));

        try {
            System.out.println(".crawl(): Start crawling at: " + start);
            if (addURL(start.getFile(), currentURLPath) != null) {
                dumpHTDoc(start);
            } else {
                System.err.println(".crawl(): Start URL has not been dumped: " + start);
            }
        } catch (MalformedURLException e) {
            System.err.println(".crawl(): ERROR: " + e);
        }

        int currentPosition = 0;

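        // Breadth-first iteration: the urlsToCrawl queue grows while pages
        // are parsed; currentPosition walks it so each URL is handled once.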
        while (currentPosition < urlsToCrawl.size()) {
            URL currentURL = (URL) urlsToCrawl.elementAt(currentPosition);
            currentURLPath = currentURL.toString().substring(0,
                    currentURL.toString().lastIndexOf("/"));

            System.out.println(".crawl(): INFO: Current Array Size: " + urlsToCrawl.size() +
                ", Current Position: " + currentPosition + ", Current URL: " +
                currentURL.toString());

            java.util.List urlsWithinPage = parsePage(currentURL.toString());

            if (urlsWithinPage != null) {
                java.util.Iterator iterator = urlsWithinPage.iterator();

                while (iterator.hasNext()) {
                    String urlCandidate = (String) iterator.next();

                    try {
                        URL urlToCrawl = null;

                        if ((urlToCrawl = addURL(urlCandidate, currentURLPath)) != null) {
                            dumpHTDoc(urlToCrawl);
                        }
                    } catch (MalformedURLException e) {
                        System.err.println(".crawl(): ERROR: " + e);
                    }
                }
            }

            currentPosition = currentPosition + 1;
        }

        try {
            java.io.PrintWriter out = new java.io.PrintWriter(new java.io.FileOutputStream(
                        url_list_file));

            for (int i = 0; i < urlsToCrawl.size(); i++) {
                out.println(urlsToCrawl.elementAt(i));
            }

            out.close();
        } catch (java.io.FileNotFoundException e) {
            System.err.println(".crawl(): ERROR: " + e);
        }
    }

    /**
     * Resolves a URL candidate against the current path and, if it passes
     * the scope, duplicate and robots.txt filters, adds it to the crawl queue.
     *
     * @param urlCandidate link as found within the page, absolute or relative
     * @param currentURLPath URL path of the page the link was found on
     *
     * @return the resolved URL if it was added to the queue, null otherwise
     *
     * @throws MalformedURLException if the candidate cannot be resolved to a valid URL
     */
    public URL addURL(String urlCandidate, String currentURLPath)
        throws MalformedURLException {

        URL url = new URL(parseHREF(urlCandidate, urlCandidate.toLowerCase(), currentURLPath));

        if (filterURL(urlCandidate, currentURLPath, urlsToCrawlLowerCase)) {
            if (!robot.disallowed(url)) {
                urlsToCrawl.add(url);
                urlsToCrawlLowerCase.add(url.toString().toLowerCase());
                System.out.println(".addURL(): INFO: URL added: " + url);

                return url;
            } else {
                System.out.println(".addURL(): INFO: Disallowed by robots.txt: " + urlCandidate);
            }
        }
        return null;
    }

    /**
     * Fetches a page and, if it is HTML, extracts the links it contains.
     *
     * @param urlString URL of the page to fetch
     *
     * @return the list of links found within the page, or null if the page
     *         could not be fetched, is not HTML, or must not be followed
     */
    public java.util.List parsePage(String urlString) {

        String status = "ok";

        try {
            URL currentURL = new java.net.URL(urlString);
            String currentURLPath = urlString.substring(0, urlString.lastIndexOf("/"));
            HttpURLConnection httpCon = (HttpURLConnection) currentURL.openConnection();

            httpCon.setRequestProperty("User-Agent", "Lenya Lucene Crawler");

            httpCon.connect();

            long lastModified = httpCon.getLastModified();

            if (httpCon.getResponseCode() == HttpURLConnection.HTTP_OK) {
                String contentType = httpCon.getContentType();

                if (contentType != null && contentType.indexOf("text/html") != -1) {
                    return handleHTML(httpCon);
                } else if (contentType != null && contentType.indexOf("application/pdf") != -1) {
                    handlePDF(httpCon);
                } else {
                    status = "Not an accepted content type: " + contentType;
                }
            } else {
                status = "bad";
            }

            httpCon.disconnect();
        } catch (java.net.MalformedURLException mue) {
            status = mue.toString();
        } catch (java.net.UnknownHostException uh) {
            status = uh.toString(); // Mark as a bad URL
        } catch (java.io.IOException ioe) {
            status = ioe.toString(); // Mark as a bad URL
        } catch (Exception e) {
            status = e.toString(); // Mark as a bad URL
        }

        if (!"ok".equals(status)) {
            System.err.println(".parsePage(): " + status);
        }

        return null;
    }

    /**
     * Parses an HTML response and extracts its links, honouring the robots
     * meta tag.
     *
     * @param httpCon open connection to the HTML page
     *
     * @return the links found within the page, or null if the robots meta
     *         tag forbids following them
     *
     * @throws java.io.IOException if the page cannot be read
     */
    public static java.util.List handleHTML(HttpURLConnection httpCon)
        throws java.io.IOException {
        ContentHandler handler = new HTMLHandler();
        handler.parse(httpCon.getInputStream());

        if (handler.getRobotFollow()) {
            return handler.getLinks();
        }

        return null;
    }

    /**
     * Handles a PDF response. Not implemented yet; the document is skipped.
     *
     * @param httpCon open connection to the PDF document
     */
    public void handlePDF(HttpURLConnection httpCon) {
        System.err.println(".handlePDF(): Not handled yet!");
    }

    /**
     * Checks whether a URL should be crawled: it must resolve to an absolute
     * URL, lie within the scope and not already be known.
     *
     * @param url link as found within the page, absolute or relative
     * @param currentURLPath URL path of the page the link was found on
     * @param links lower-case URLs that are already queued
     *
     * @return true if the URL should be crawled, false otherwise
     */
    public boolean filterURL(String url, String currentURLPath, java.util.TreeSet links) {

        String urlLowCase = url.toLowerCase();

        if (!(urlLowCase.startsWith("http://") || urlLowCase.startsWith("https://"))) {
            url = parseHREF(url, urlLowCase, currentURLPath);

            if (url != null) {
                urlLowCase = url.toLowerCase();
            }
        }

        if ((url != null) && inScope(url)) {
            if (!links.contains(urlLowCase)) {
                return true;
            }
        } else {
            System.out.println(".filterURL(): Not in scope: " + url);
        }

        return false;
    }

    /**
     * Resolves a link to an absolute URL: relative references are completed
     * against the current URL path or the root URL, and anchors are stripped.
     *
     * @param url link as found within the page
     * @param urlLowCase lower-case version of the link, used for prefix tests
     * @param currentURLPath URL path of the page the link was found on
     *
     * @return the absolute URL, or null if the link cannot be followed
     *         (javascript:, mailto: or a pure anchor)
     */
    public String parseHREF(String url, String urlLowCase, String currentURLPath) {
        if (urlLowCase.startsWith("http://") || urlLowCase.startsWith("https://")) {
            return url;
        }

        // Complete relative URLs against the root or current URL path.
        if (urlLowCase.startsWith("/")) {
            url = rootURL + url;
        } else if (urlLowCase.startsWith("./")) {
            url = currentURLPath + url.substring(1, url.length());
        } else if (urlLowCase.startsWith("../")) {
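            // Worked example (assumed values): with currentURLPath
            // "http://host/a/b" and url "../c.html", back ends up as 1 and
            // the result is "http://host/a/c.html".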
            int back = 1;

            // Count the number of leading "../" segments.
            while (urlLowCase.indexOf("../", back * 3) != -1) {
                back++;
            }

            int pos = currentURLPath.length();
            int count = back;

            while (count-- > 0) {
                pos = currentURLPath.lastIndexOf("/", pos) - 1;
            }

            url = currentURLPath.substring(0, pos + 2) + url.substring(3 * back, url.length());
        } else if (urlLowCase.startsWith("javascript:")) {
            // handle javascript:...
            System.err.println(".parseHREF(): WARN: \"javascript:\" is not implemented yet!");
            url = null;
        } else if (urlLowCase.startsWith("#")) {
            System.err.println(".parseHREF(): WARN: \"#\" (anchor) will be ignored!");
            // internal anchor... ignore.
            url = null;
        } else if (urlLowCase.startsWith("mailto:")) {
            System.err.println(".parseHREF(): WARN: \"mailto:\" is not a URL to be followed!");
            // handle mailto:...
            url = null;
        } else {
            url = currentURLPath + "/" + url;
        }

        // strip anchor if exists otherwise crawler may index content multiple times
        // links to the same url but with unique anchors would be considered unique
        // by the crawler when they should not be
        if (url != null) {
            int i;

            if ((i = url.indexOf("#")) != -1) {
                url = url.substring(0, i);
            }
        }

        return url;
    }

    /**
     * Checks whether a URL lies within the configured scope.
     *
     * @param url absolute URL to check
     *
     * @return true if the URL starts with one of the scope prefixes
     */
    public boolean inScope(String url) {
        for (int i = 0; i < scopeURL.length; i++) {
            if (url.startsWith(scopeURL[i])) {
                return true;
            }
        }

        return false;
    }

    /**
     * Completes a child URL relative to its parent. Not implemented yet;
     * the current implementation simply returns the parent URL.
     *
     * @param parent URL of the page the link was found on
     * @param child link to complete
     *
     * @return currently always the parent URL
     *
     * @throws MalformedURLException never thrown by the current implementation
     */
    public URL completeURL(URL parent, String child) throws MalformedURLException {
        return parent;
    }

    /**
     * Dumps the document behind the given URL into the dump directory,
     * mirroring the URL's path. Only html, htm, txt and pdf files are dumped;
     * a URL ending in "/" is saved as "index.html".
     *
     * @param url URL of the document to dump
     */
    public void dumpHTDoc(URL url) {
        String ext = getExtension(url);

        String filename = html_dump_directory + url.getFile();
        File file = new File(filename);

        if (filename.charAt(filename.length() - 1) == '/') {
            file = new File(filename + "index.html");
            ext = getExtension(file);
        }

        if (ext.equals("html") || ext.equals("htm") || ext.equals("txt") || ext.equals("pdf")) {
            try {
                File parent = new File(file.getParent());

                if (!parent.exists()) {
                    parent.mkdirs();
                }

                HttpURLConnection httpConnection = (HttpURLConnection) url.openConnection();
                java.io.InputStream in = httpConnection.getInputStream();
                BufferedInputStream bin = new BufferedInputStream(in);

                // Copy the bytes verbatim so that binary formats such as
                // PDF are not corrupted by character decoding.
                FileOutputStream out = new FileOutputStream(file);
                int i;

                while ((i = bin.read()) != -1) {
                    out.write(i);
                }

                out.close();
                bin.close();
                in.close();
                httpConnection.disconnect();

                System.out.println(".dumpHTDoc(): INFO: URL dumped: " + url);
            } catch (Exception e) {
                System.err.println(".dumpHTDoc(): ERROR: " + e);
                System.out.println(".dumpHTDoc(): ERROR: URL not dumped: " + url);
            }
        } else {
            System.out.println(".dumpHTDoc(): INFO: URL not dumped: " + url);
        }
    }

    /**
     * Saves the given bytes to a file, creating parent directories as
     * needed. A filename ending in "/" is saved as "index.html" within
     * that directory.
     *
     * @param filename name of the file to write
     * @param bytes content to write
     *
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if writing fails
     */
    public void saveToFile(String filename, byte[] bytes)
        throws FileNotFoundException, IOException {
        File file = new File(filename);

        if (filename.charAt(filename.length() - 1) == '/') {
            file = new File(filename + "index.html");
        }

        File parent = new File(file.getParent());

        if (!parent.exists()) {
            System.out.println(".saveToFile(): Directory will be created: " +
                parent.getAbsolutePath());
            parent.mkdirs();
        }

        FileOutputStream out = new FileOutputStream(file.getAbsolutePath());
        out.write(bytes);
        out.close();
    }

    /**
     * Returns the file extension of a URL's path.
     *
     * @param url URL to inspect
     *
     * @return the extension, e.g. "html"
     */
    public String getExtension(URL url) {
        return getExtension(new File(url.getPath()));
    }

    /**
     * Returns the file extension of a file path, i.e. the substring after
     * the last ".". If the path contains no ".", the whole path is returned.
     *
     * @param file file to inspect
     *
     * @return the extension, e.g. "html"
     */
    public String getExtension(File file) {
        StringTokenizer st = new StringTokenizer(file.getPath(), ".");
        String extension = null;

        while (st.hasMoreElements()) {
            extension = st.nextToken();
        }

        return extension;
    }
}
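
Usage example

The crawler is normally driven through main() and a crawler.xconf
configuration file, but it can also be invoked programmatically. The sketch
below is not part of the original source; the paths, user agent and URLs are
hypothetical placeholders.

// Hypothetical driver class; assumes IterativeHTMLCrawler is on the classpath.
import org.apache.lenya.search.crawler.IterativeHTMLCrawler;

import java.net.MalformedURLException;
import java.net.URL;

public class CrawlerExample {
    public static void main(String[] args) throws MalformedURLException {
        IterativeHTMLCrawler crawler = new IterativeHTMLCrawler(
                "work/url_file.txt", // receives the list of crawled URLs
                "work/html_dump",    // receives the dumped documents
                "lenya-crawler");    // user agent for robots.txt matching

        // Crawl everything under the scope prefix, starting at the seed URL.
        crawler.crawl(new URL("http://www.lenya.org/index.html"),
            "http://www.lenya.org/");
    }
}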