Package net.htmlparser.jericho

Examples of net.htmlparser.jericho.Element


    List<Element> labels = source.getAllElements(HTMLElementName.DT);
    List<Element> values = source.getAllElements(HTMLElementName.DD);
    int cellCount = Math.min(labels.size(), values.size());
    for (int i = 0; i < cellCount; i++) {
      String label = labels.get(i).getTextExtractor().toString().trim().replaceAll(":$", "");
      Element valueElement = values.get(i);
      log.debug("looking at value element: {}", valueElement);
      String value = getValueFieldText(valueElement);
      extractedFields.add(new ScrapedField(label, value));
    }
    return extractedFields;
View Full Code Here


  @Override
  public String performExtraction() {
    String attributeValue = "";
    if (getSource().getAllElements().size() > 0) {
      Element targetElement = getSource().getAllElements().get(0);
      attributeValue = targetElement.getAttributeValue(attributeName);
    }
    return attributeValue;
  }
View Full Code Here

  @Override
  public String performExtraction() {
    String extractedSource = "";
    if (tagOccurrence.getIdentifier() != null) {
      log.debug("about to splice: {}", tagOccurrence);
      Element element = ScraperUtil.extract(getSource(), tagOccurrence);
      if (element != null) {
        extractedSource = element.toString();
      }
      log.debug("spliced out: {}", extractedSource);
    } else {
      extractedSource = ScraperUtil.extract(getSource().toString(), tagOccurrence.getTag(),
          tagOccurrence.getOccurrence());
View Full Code Here

    log.debug("occurrence {} at {} to {}", new Object[] { occurrence, begin, length });
    return tags[occurrence].substring(begin, length);
  }

  public static Element extract(Source source, TagOccurrence tagOccurrence) {
    Element result = null;
    if (tagOccurrence.getElementIdentifierType() == ElementIdentifierType.cssClass) {
      List<Element> elements = source.getAllElementsByClass(tagOccurrence.getIdentifier());
      if(elements != null && !elements.isEmpty())
        result = elements.get(0);
    } else if (tagOccurrence.getElementIdentifierType() == ElementIdentifierType.ID) {
View Full Code Here

    String result = null;
    Source source = new Source(html);
    source.fullSequentialParse();
    if (tagOccurrence.getElementIdentifierType() == ElementIdentifierType.ID) {
      log.debug("extracting tag by id: {}", tagOccurrence.getIdentifier());
      Element idElement = source.getElementById(tagOccurrence.getIdentifier());
      if (idElement != null) {
        result = idElement.toString();
      } else {
        result = "";
      }
    } else if (tagOccurrence.getElementIdentifierType() == ElementIdentifierType.cssClass) {
      log.debug("extracting: {}", tagOccurrence);
View Full Code Here

    List<Field> extractedFields = new ArrayList<Field>();

    for (DesignatedField designatedField : this.fieldsToGet) {
            log.debug("designated field: {}", designatedField);
            log.debug("tag to get value from: {}", designatedField.getTagToGetValueFrom());
      Element elementWithValue = ScraperUtil.extract(getSource(), designatedField.getTagToGetValueFrom());
            log.debug("element with value: {}", elementWithValue);
      String value = getValueFieldText(elementWithValue);
      log.debug("looking for field: {}, value: {}", designatedField.getLabel(), value);
      extractedFields.add(new ScrapedField(designatedField.getLabel(), value));
    }
View Full Code Here

    if (cellCount == (rowCount * 2)) {
      Field lastField = null;
      log.debug("cells.size: {}", cellCount);
      List<Element> cells = source.getAllElements(HTMLElementName.TD);
      for (int i = 0; i < cellCount; i++) {
        Element labelElement = cells.get(i);
        Element valueElement = cells.get(++i);
        String label = labelElement.getTextExtractor().toString().trim().replaceAll(":$", "");
        String value = getValueFieldText(valueElement);
        log.debug("found field: {}={}", label, value);
        if (StringUtils.isEmpty(label) && lastField != null) {
          lastField.addValue(value);
View Full Code Here

    List<Element> labels = source.getAllElements(HTMLElementName.DT);
    List<Element> values = source.getAllElements(HTMLElementName.DD);
    int cellCount = Math.min(labels.size(), values.size());
    for (int i = 0; i < cellCount; i++) {
      String label = labels.get(i).getTextExtractor().toString().trim().replaceAll(":$", "");
      Element valueElement = values.get(i);
      log.debug("looking at value element: {}", valueElement);
      String value = getValueFieldText(valueElement);
      extractedFields.add(new ScrapedField(label, value));
    }
    return extractedFields;
View Full Code Here

  }

  private void removeInvalidFields(List<Element> fields) {
    java.util.Iterator<Element> iterator = fields.iterator();
    while (iterator.hasNext()) {
      Element field = iterator.next();
      if (!isAField(field.toString())) {
        log.debug("pruning invalid field: {}", field);
        iterator.remove();
      }
    }
  }
View Full Code Here

  @Override
  public List<Field> getFields() {
    List<Field> extractedFields = new ArrayList<Field>();

    for (DesignatedField designatedField : this.fieldsToGet) {
      Element elementWithValue = ScraperUtil.extract(getSource(), designatedField.getTagToGetValueFrom());
      String value = elementWithValue.getTextExtractor().toString();
      log.debug("looking for field: {}, value: {}", designatedField.getLabel(), value);
      extractedFields.add(new ScrapedField(designatedField.getLabel(), value));
    }

    return extractedFields;
View Full Code Here

TOP

Related Classes of net.htmlparser.jericho.Element

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.