Examples of fullSequentialParse()


Examples of net.htmlparser.jericho.Source.fullSequentialParse()

   * @throws IOException
   */
  public String execute() throws IOException {
    String result = "";
    Source source = new Source(url);
    source.fullSequentialParse();
    // log.debug("parsed source: {}", source.toString());

    if (idToGet != null) {
      result = source.getElementById(idToGet).getTextExtractor().toString();
    } else if (classToGet != null) {
View Full Code Here

Examples of net.htmlparser.jericho.Source.fullSequentialParse()

   */
  public List<String> getResults() throws IOException {
    log.debug("extracting results from url: {}", url);
    List<String> results = new ArrayList<String>();
    Source source = new Source(url);
    source.fullSequentialParse();
    String content = source.toString();
    List<Element> currentElements = null;
    for (TagOccurrence toGet : tagsToGet) {
      log.debug("toGet = {}", toGet);
      if (toGet.getOccurrence() > 0) {
View Full Code Here

Examples of net.htmlparser.jericho.Source.fullSequentialParse()

    for (int i = 0; i < afterTagOccurrence.getOccurrence(); i++) {
      sourceHtml = sourceHtml.substring(sourceHtml.indexOf(endAfterTag) + 1);
    }
    String afterSource = sourceHtml;
    Source newSource = new Source(afterSource);
    newSource.fullSequentialParse();
    return newSource;
  }

  public Extractor asText() {
    this.outputFormat = OutputFormats.Text;
View Full Code Here

Examples of net.htmlparser.jericho.Source.fullSequentialParse()

      extractedFields.addAll(defaultFieldExtractions());
    }

    if (this.classToGet != null) {
      Source source = new Source(url);
      source.fullSequentialParse();
      List<Element> elements = source.getAllElementsByClass(classToGet);
      String text = elements.get(0).toString();
      String[] fields = text.split("<br>");
      log.debug("fields: {}", fields);
      for (String field : fields) {
View Full Code Here

Examples of net.htmlparser.jericho.Source.fullSequentialParse()

    }

    Source source = new Source(url);
    for (TagOccurrence tagOccurrence : this.tagsToGet) {
      // log.debug("extracting fields using tag: {}", tagOccurrence);
      source.fullSequentialParse();
      if (!(tagOccurrence.getTag().contains(HTMLElementName.TABLE) || tagOccurrence.getTag().contains(
          HTMLElementName.A))) {
        throw new IllegalStateException(MessageFormat.format(
            "Asked to extract tag: {0}, only know how to extract fields from tables.",
            tagOccurrence.getTag()));
View Full Code Here

Examples of net.htmlparser.jericho.Source.fullSequentialParse()

        }

      }
    }
    source = new Source(url);
    source.fullSequentialParse();
    if (this.afterTagOccurrence != null) {
      source = pruneFrom(source, afterTagOccurrence);
    }
    for (FieldToGet fieldToGet : fieldsToGet) {
      String value = "";
View Full Code Here

Examples of net.htmlparser.jericho.Source.fullSequentialParse()

  private List<Field> extractLinksFromList(String html) {
    log.debug("extracting links...");
    List<Field> fields = new ArrayList<Field>();
    Source source = new Source(html);
    source.fullSequentialParse();
    List<Element> links = source.getAllElements(HTMLElementName.A);
    for (Element a : links) {
      String label = a.getTextExtractor().toString();
      String href = a.getAttributeValue("href");
      if (matchingPattern == null || href.contains(matchingPattern)) {
View Full Code Here

Examples of net.htmlparser.jericho.Source.fullSequentialParse()

    return this;
  }

  private boolean fieldHasMultipleValues(String fieldValue) {
    Source source = new Source(fieldValue);
    source.fullSequentialParse();
    return source.getAllElements(HTMLElementName.BR).size() > 1;
  }

  private String delimitFieldValues(String source) {
    Source result = new Source(source.replace("<br>", ";").replace("<br/>", ";"));
View Full Code Here

Examples of net.htmlparser.jericho.Source.fullSequentialParse()

    }
  }

  public static List<Link> extractLinks(String sourceToParse) {
    Source source = new Source(sourceToParse);
    source.fullSequentialParse();
    List<Link> links = new ArrayList<Link>();
    List<Element> as = source.getAllElements(HTMLElementName.A);
    for (Element linkElement : as) {
      links.add(new Link(linkElement.getTextExtractor().toString(), linkElement.getAttributeValue("href")));
    }
View Full Code Here

Examples of net.htmlparser.jericho.Source.fullSequentialParse()

  }

  public static String extractUsingIdentifier(String html, TagOccurrence tagOccurrence) {
    String result = null;
    Source source = new Source(html);
    source.fullSequentialParse();
    if (tagOccurrence.getElementIdentifierType() == ElementIdentifierType.ID) {
      log.debug("extracting tag by id: {}", tagOccurrence.getIdentifier());
      Element idElement = source.getElementById(tagOccurrence.getIdentifier());
      if (idElement != null) {
        result = idElement.toString();
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.