Package org.apache.nutch.indexer

Examples of org.apache.nutch.indexer.IndexingFilters


   
    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }
       
    IndexingFilters indexers = new IndexingFilters(conf);
   
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();
   
    Content content = protocol.getProtocolOutput(new Text(url), datum)
        .getContent();

    // store the guessed content type in the crawldatum
    if (content.getContentType() != null) datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));

    if (content == null) {
      System.out.println("No content for " + url);
      return 0;
    }
   
    contentType = content.getContentType();
   
    if (contentType == null) {
      return -1;
    }
   
    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }

    ParseResult parseResult = new ParseUtil(conf).parse(content);
   
    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);

    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }

    for (String fname : doc.getFieldNames()) {
View Full Code Here


    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }

    IndexingFilters indexers = new IndexingFilters(conf);

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();

    ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
   
    if (!output.getStatus().isSuccess()) {
      System.out.println("Fetch failed with protocol status: " + output.getStatus());
      return 0;
    }
        
    Content content = output.getContent();

    if (content == null) {
      System.out.println("No content for " + url);
      return 0;
    }

    contentType = content.getContentType();

    if (contentType == null) {
      return -1;
    }

    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));

    if (ParseSegment.isTruncated(content)) {
      LOG.warn("Content is truncated, parse may fail!");
    }

    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }

    ParseResult parseResult = new ParseUtil(conf).parse(content);

    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);

    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }

    if (doc == null) {
View Full Code Here

   
    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }
       
    IndexingFilters indexers = new IndexingFilters(conf);
   
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();
   
    Content content = protocol.getProtocolOutput(new Text(url), datum)
        .getContent();
   
    if (content == null) {
      System.out.println("No content for " + url);
      return 0;
    }
   
    contentType = content.getContentType();
   
    if (contentType == null) {
      return -1;
    }
   
    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }
   
    ParseResult parseResult = new ParseUtil(conf).parse(content);
   
    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);

    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }

    for (String fname : doc.getFieldNames()) {
View Full Code Here

    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }

    IndexingFilters indexers = new IndexingFilters(conf);

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();

    Content content = protocol.getProtocolOutput(new Text(url), datum)
        .getContent();

    if (content == null) {
      System.out.println("No content for " + url);
      return 0;
    }

    contentType = content.getContentType();

    if (contentType == null) {
      return -1;
    }

    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));

    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }

    ParseResult parseResult = new ParseUtil(conf).parse(content);

    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);

    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }

    if (doc == null) {
View Full Code Here

TOP

Related Classes of org.apache.nutch.indexer.IndexingFilters

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.