Package: websphinx

Usage examples of websphinx.RobotExclusion
     */
    public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) {
        this.url_list_file = url_list_file;
        this.html_dump_directory = html_dump_directory;

        robot = new RobotExclusion(userAgent);
    }
View Full Code Here


        log.debug("URI list file: " + this.url_list_file);

        this.html_dump_directory = ce.getHTDocsDumpDirResolved();
        log.debug("HTDocs Dump Dir: " + this.html_dump_directory);

        robot = new RobotExclusion(ce.getUserAgent());

        String robots_file = ce.getRobotsFileResolved();
        log.debug("Robots File: " + robots_file);
        String robots_domain = ce.getRobotsDomain();
        if (robots_file != null && robots_domain != null) {
View Full Code Here

     */
    public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) {
        this.url_list_file = url_list_file;
        this.html_dump_directory = html_dump_directory;

        robot = new RobotExclusion(userAgent);
    }
View Full Code Here

        log.debug("URI list file: " + this.url_list_file);

        this.html_dump_directory = ce.getHTDocsDumpDirResolved();
        log.debug("HTDocs Dump Dir: " + this.html_dump_directory);

        robot = new RobotExclusion(ce.getUserAgent());

        String robots_file = ce.getRobotsFileResolved();
        log.debug("Robots File: " + robots_file);
        String robots_domain = ce.getRobotsDomain();
        if (robots_file != null && robots_domain != null) {
View Full Code Here

     * @param _userAgent User-agent for robots.txt
     */
    public IterativeHTMLCrawler(String _uriList, String _htdocsDumpDir, String _userAgent) {
        this.uriList = _uriList;
        this.htdocsDumpDir = _htdocsDumpDir;
        this.robot = new RobotExclusion(_userAgent);
    }
View Full Code Here

          System.err.println("Cannot load crawler configuration! " + e.toString());
        } catch (final IOException e) {
          System.err.println("Cannot load crawler configuration! " + e.toString());
        }

        this.robot = new RobotExclusion(this.userAgent);

        if (this.robotsFile != null && this.robotsDomain != null) {
            log.debug(this.robotsFile + " " + this.robotsDomain);
            this.robot.addLocalEntries(this.robotsDomain, new File(this.robotsFile));
        }
View Full Code Here

     */
    public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) {
        this.url_list_file = url_list_file;
        this.html_dump_directory = html_dump_directory;

        robot = new RobotExclusion(userAgent);
    }
View Full Code Here

     */
    public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) {
        this.url_list_file = url_list_file;
        this.html_dump_directory = html_dump_directory;

        robot = new RobotExclusion(userAgent);
    }
View Full Code Here

     */
    public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) {
        this.url_list_file = url_list_file;
        this.html_dump_directory = html_dump_directory;

        robot = new RobotExclusion(userAgent);
    }
View Full Code Here


        this.url_list_file = ce.resolvePath(ce.getURIList());
        this.html_dump_directory = ce.resolvePath(ce.getHTDocsDumpDir());

        robot = new RobotExclusion(ce.getUserAgent());

        String robots_file = ce.getRobotsFile();
        String robots_domain = ce.getRobotsDomain();
        if (robots_file != null && robots_domain != null) {
            log.debug(robots_file + " " + robots_domain);
View Full Code Here

TOP

Related Classes of websphinx.RobotExclusion

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.