Package net.sf.regain.crawler.preparator.html

Examples of net.sf.regain.crawler.preparator.html.HtmlPathExtractor


      String pathEndRegex = (String) sectionArr[i].get("endRegex");
      String pathNodeRegex = (String) sectionArr[i].get("pathNodeRegex");
      int pathNodeUrlGroup = getIntParam(sectionArr[i], "pathNodeRegex.urlGroup");
      int pathNodeTitleGroup = getIntParam(sectionArr[i], "pathNodeRegex.titleGroup");

      mPathExtractorArr[i] = new HtmlPathExtractor(prefix, pathStartRegex,
        pathEndRegex, pathNodeRegex, pathNodeUrlGroup,
        pathNodeTitleGroup);
    }
  }
View Full Code Here


      // Set the headlines
      setHeadlines(headlines);
    }

    // Find the path extractor that is responsible for this document
    HtmlPathExtractor pathExtractor = null;
    if (mPathExtractorArr != null) {
      for (int i = 0; i < mPathExtractorArr.length; i++) {
        if (mPathExtractorArr[i].accepts(rawDocument)) {
          pathExtractor = mPathExtractorArr[i];
        }
      }
    }

    // Extract the path from the document
    if (pathExtractor != null) {
      PathElement[] path = pathExtractor.extractPath(rawDocument);
      setPath(path);
    }
  }
View Full Code Here

TOP

Related Classes of net.sf.regain.crawler.preparator.html.HtmlPathExtractor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.