Package: websphinx

Usage examples of websphinx.RobotExclusion
     */
    public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) {
        this.url_list_file = url_list_file;
        this.html_dump_directory = html_dump_directory;

        robot = new RobotExclusion(userAgent);
    }
View Full Code Here


        log.debug("URI list file: " + this.url_list_file);

        this.html_dump_directory = ce.getHTDocsDumpDirResolved();
        log.debug("HTDocs Dump Dir: " + this.html_dump_directory);

        robot = new RobotExclusion(ce.getUserAgent());

        String robots_file = ce.getRobotsFileResolved();
        log.debug("Robots File: " + robots_file);
        String robots_domain = ce.getRobotsDomain();
        if (robots_file != null && robots_domain != null) {
View Full Code Here

     */
    public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) {
        this.url_list_file = url_list_file;
        this.html_dump_directory = html_dump_directory;

        robot = new RobotExclusion(userAgent);
    }
View Full Code Here

        log.debug("URI list file: " + this.url_list_file);

        this.html_dump_directory = ce.getHTDocsDumpDirResolved();
        log.debug("HTDocs Dump Dir: " + this.html_dump_directory);

        robot = new RobotExclusion(ce.getUserAgent());

        String robots_file = ce.getRobotsFileResolved();
        log.debug("Robots File: " + robots_file);
        String robots_domain = ce.getRobotsDomain();
        if (robots_file != null && robots_domain != null) {
View Full Code Here

     * @param _userAgent User-agent for robots.txt
     */
    public IterativeHTMLCrawler(String _uriList, String _htdocsDumpDir, String _userAgent) {
        this.uriList = _uriList;
        this.htdocsDumpDir = _htdocsDumpDir;
        this.robot = new RobotExclusion(_userAgent);
    }
View Full Code Here

          System.err.println("Cannot load crawler configuration! " + e.toString());
        } catch (final IOException e) {
          System.err.println("Cannot load crawler configuration! " + e.toString());
        }

        this.robot = new RobotExclusion(this.userAgent);

        if (this.robotsFile != null && this.robotsDomain != null) {
            log.debug(this.robotsFile + " " + this.robotsDomain);
            this.robot.addLocalEntries(this.robotsDomain, new File(this.robotsFile));
        }
View Full Code Here

     */
    public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) {
        this.url_list_file = url_list_file;
        this.html_dump_directory = html_dump_directory;

        robot = new RobotExclusion(userAgent);
    }
View Full Code Here

     */
    public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) {
        this.url_list_file = url_list_file;
        this.html_dump_directory = html_dump_directory;

        robot = new RobotExclusion(userAgent);
    }
View Full Code Here

     */
    public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) {
        this.url_list_file = url_list_file;
        this.html_dump_directory = html_dump_directory;

        robot = new RobotExclusion(userAgent);
    }
View Full Code Here


        this.url_list_file = ce.resolvePath(ce.getURIList());
        this.html_dump_directory = ce.resolvePath(ce.getHTDocsDumpDir());

        robot = new RobotExclusion(ce.getUserAgent());

        String robots_file = ce.getRobotsFile();
        String robots_domain = ce.getRobotsDomain();
        if (robots_file != null && robots_domain != null) {
            log.debug(robots_file + " " + robots_domain);
View Full Code Here

TOP

Related Classes of websphinx.RobotExclusion

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.