Package com.ontometrics.scraper.extraction

Source Code of com.ontometrics.scraper.extraction.SpecifiedFieldExtractor$Builder

package com.ontometrics.scraper.extraction;

import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import net.htmlparser.jericho.Element;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ontometrics.scraper.PairedTags;
import com.ontometrics.scraper.TagOccurrence;
import com.ontometrics.scraper.util.ScraperUtil;

/**
* This is the simplest form of an extractor: we just tell the scraper how to
* identify the fields we want. There is no default behavior, no attempt to
* extract from tables, or paired tags. Each field must be given as a designated
* field and the scraper will find it in the page and extract its value.
*
* @author Rob
*/
public class SpecifiedFieldExtractor extends BaseExtractor implements FieldExtractor<SpecifiedFieldExtractor> {

  private Logger log = LoggerFactory.getLogger(SpecifiedFieldExtractor.class);

  private Set<DesignatedField> fieldsToGet;

  public static class Builder {

    private ElementIdentifierType defaultElementIdentifier;
    private Set<DesignatedField> fieldsToGet = new HashSet<DesignatedField>();

    public Builder identifyFieldsByClass() {
      this.defaultElementIdentifier = ElementIdentifierType.cssClass;
      return this;
    }

    public Builder addField(String identifier) {
      this.fieldsToGet.add(new DesignatedField(identifier, new TagOccurrence.Builder()
          .elementIdentifierType(defaultElementIdentifier)
            .identifier(identifier)
            .build()));
      return this;
    }

    public Builder addFields(Collection<String> fields) {
      for (String fieldName : fields) {
        addField(fieldName);
      }
      return this;
    }

    public SpecifiedFieldExtractor build() {
      return new SpecifiedFieldExtractor(this);
    }
  }

  @Override
  public SpecifiedFieldExtractor url(URL url) {
    super.url(url);
    return this;
  }

  @Override
  public SpecifiedFieldExtractor section(HtmlExtractor htmlExtractor) {
    super.section(htmlExtractor);
    return this;
  }

  @Override
  public SpecifiedFieldExtractor source(HtmlExtractor htmlExtractor) {
    super.source(htmlExtractor);
    return this;
  }

  public SpecifiedFieldExtractor(Builder builder) {
    this.fieldsToGet = builder.fieldsToGet;
  }

  public Set<DesignatedField> getFieldsToGet() {
    return fieldsToGet;
  }

  @Override
  public List<Field> getFields() {
    List<Field> extractedFields = new ArrayList<Field>();

    for (DesignatedField designatedField : this.fieldsToGet) {
      Element elementWithValue = ScraperUtil.extract(getSource(), designatedField.getTagToGetValueFrom());
      String value = elementWithValue.getTextExtractor().toString();
      log.debug("looking for field: {}, value: {}", designatedField.getLabel(), value);
      extractedFields.add(new ScrapedField(designatedField.getLabel(), value));
    }

    return extractedFields;
  }

  @Override
  public SpecifiedFieldExtractor field(String label, String element) {
    // TODO Auto-generated method stub
    return null;
  }

  @Override
  public SpecifiedFieldExtractor add(PairedTags pairedTags) {
    // TODO Auto-generated method stub
    return null;
  }

}
TOP

Related Classes of com.ontometrics.scraper.extraction.SpecifiedFieldExtractor$Builder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.