Examples of ParseImpl


Examples of net.nutch.parse.ParseImpl

    // collect outlink
    Outlink[] outlinks = new Outlink[0];

    ParseData parseData = new ParseData(title, outlinks, metadata);
    return new ParseImpl(text, parseData);
    // any filter?
    //return HtmlParseFilters.filter(content, parse, root);
  }
View Full Code Here

Examples of net.nutch.parse.ParseImpl

    // collect meta data
    Properties metadata = new Properties();
    metadata.putAll(content.getMetadata()); // copy through

    ParseData parseData = new ParseData(title, outlinks, metadata);
    return new ParseImpl(text, parseData);
    // any filter?
    //return HtmlParseFilters.filter(content, parse, root);
  }
View Full Code Here

Examples of net.nutch.parse.ParseImpl

    // collect meta data
    Properties metaData = new Properties();
    metaData.putAll(content.getMetadata()); // copy through

    ParseData parseData = new ParseData(title, outlinks, metaData);
    return new ParseImpl(text, parseData);
  }
View Full Code Here

Examples of org.apache.droids.parse.ParseImpl

    this.mimeType = mimeType;
  }

  @Override
  public Parse getParse() {
    return new ParseImpl(text, outlinks);
  }
View Full Code Here

Examples of org.apache.droids.parse.ParseImpl

    } catch (SAXException ex) {
      throw new ContentFormatViolationException("Failure parsing HTML content", ex);
    } finally {
      instream.close();
    }
    return new ParseImpl(newLink.getId(),linkExtractor.getLinks());
  }
View Full Code Here

Examples of org.apache.droids.parse.ParseImpl

    BodyContentHandler handler = new BodyContentHandler();
   
    InputStream instream = entity.obtainContent();
    try {
      parser.parse(instream, handler, metadata, new ParseContext());
      ParseImpl parse = new ParseImpl(handler.toString(),null);
     
      return parse;

    } catch (SAXException ex) {
      throw new DroidsException("Failure parsing document " + link.getId(), ex);
View Full Code Here

Examples of org.apache.droids.parse.ParseImpl

    InputStream instream = entity.obtainContent();
    try {
      parser.parse(instream, parallelHandler, metadata);
     
      return new ParseImpl(data.toString(), extractor.getLinks());
    } catch (SAXException ex) {
      throw new DroidsException("Failure parsing document " + link.getId(), ex);
    } catch (TikaException ex) {
      throw new DroidsException("Failure parsing document " + link.getId(), ex);
    } finally {
View Full Code Here

Examples of org.apache.nutch.parse.ParseImpl

        }
      }

      // get parse metadata
      Metadata metadata = parseData.getContentMeta();
      Parse parse = new ParseImpl(parseText, parseData);

      // handle redirect urls
      Text reprUrlText = (Text)fetchDatum.getMetaData().get(
        Nutch.WRITABLE_REPR_URL_KEY);
      String reprUrl = reprUrlText != null ? reprUrlText.toString() : null;
      String url = key.toString();
      String fieldUrl = (reprUrl != null) ? reprUrl : url;
      String host = URLUtil.getHost(fieldUrl);

      // add segment, used to map from merged index back to segment files
      FieldWritable segField = new FieldWritable(Fields.SEGMENT,
        metadata.get(Nutch.SEGMENT_NAME_KEY), FieldType.CONTENT, false, true,
        false);
      fieldsList.add(segField);

      // add digest, used by dedup
      FieldWritable digestField = new FieldWritable(Fields.DIGEST,
        metadata.get(Nutch.SIGNATURE_KEY), FieldType.CONTENT, false, true,
        false);
      fieldsList.add(digestField);

      // url is both stored and indexed, so it's both searchable and returned
      fieldsList.add(new FieldWritable(Fields.URL, fieldUrl, FieldType.CONTENT,
        true, true, true));
      fieldsList.add(new FieldWritable(Fields.SEG_URL, url, FieldType.CONTENT,
        false, true, false));

      if (reprUrl != null) {
        // also store original url as both stored and indexed
        fieldsList.add(new FieldWritable(Fields.ORIG_URL, url,
          FieldType.CONTENT, true, true, true));
      }

      if (host != null) {
        // add host as un-stored, indexed and tokenized
        FieldWritable hostField = new FieldWritable(Fields.HOST, host,
          FieldType.CONTENT, true, false, true);
        fieldsList.add(hostField);

        // add site as un-stored, indexed and un-tokenized
        FieldWritable siteField = new FieldWritable(Fields.SITE, host,
          FieldType.CONTENT, true, false, false);
        fieldsList.add(siteField);
      }

      // content is indexed, so that it's searchable, but not stored in index
      fieldsList.add(new FieldWritable(Fields.CONTENT, parse.getText(),
        FieldType.CONTENT, true, false, true));

      // title
      String title = parse.getData().getTitle();
      if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
        title = title.substring(0, MAX_TITLE_LENGTH);
      }
      // add title indexed and stored so that it can be displayed
      fieldsList.add(new FieldWritable(Fields.TITLE, title, FieldType.CONTENT,
        true, true, true));

      // add cached content/summary display policy, if available
      String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
      if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
        fieldsList.add(new FieldWritable(Fields.CACHE, caching,
          FieldType.CONTENT, false, true, false));
      }
View Full Code Here

Examples of org.apache.nutch.parse.ParseImpl

    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());

    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata());
    return ParseResult.createParseResult(content.getUrl(),
                                         new ParseImpl(text, parseData));
  }
View Full Code Here

Examples of org.apache.nutch.parse.ParseImpl

    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());

    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // any filter?
    //return HtmlParseFilters.filter(content, parse, root);
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.