Package org.apache.nutch.protocol

Examples of org.apache.nutch.protocol.Content


        try {
            Configuration conf = NutchConfiguration.create();

      byte[] raw = getRawBytes(new File(file));
            Metadata meta = new Metadata();
            Content content = new Content(file, file, raw, "application/pdf", meta, conf);

      //Protocol protocol = new ProtocolFactory(conf).getProtocol(file);
      //Content content = protocol.getProtocolOutput(new Text(file), new CrawlDatum()).getContent();
      //Parse parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);
View Full Code Here


   */
  public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
View Full Code Here

        try {
            Configuration conf = NutchConfiguration.create();

      byte[] raw = getRawBytes(new File(file));
            Metadata meta = new Metadata();
            Content content = new Content(file, file, raw, "application/pdf", meta, conf);

      //Protocol protocol = new ProtocolFactory(conf).getProtocol(file);
      //Content content = protocol.getProtocolOutput(new Text(file), new CrawlDatum()).getContent();
      //Parse parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);
View Full Code Here

        try {
            Configuration conf = NutchConfiguration.create();

            byte[] raw = getRawBytes(new File(file));
            Metadata meta = new Metadata();
            Content content = new Content(file, file, raw, "trec/plain", meta, conf);

            //Protocol protocol = new ProtocolFactory(conf).getProtocol(file);                                                                                                                                                             
            //Content content = protocol.getProtocolOutput(new Text(file), new CrawlDatum()).getContent();                                                                                                                                 
            //Parse parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);                                                                                                                                                  
View Full Code Here

      DataInputStream in = new DataInputStream(new FileInputStream(file));
     
      try
      {
        in.readFully(bytes);
        Parse parse = parser.getParse(new Content(url, url, bytes,
          "application/pdf", new Metadata(), conf));
        System.out.println(parse.getData().getTitle());
      }
      finally
      {
View Full Code Here

        try {
            Configuration conf = NutchConfiguration.create();

            byte[] raw = getRawBytes(new File(file));
            Metadata meta = new Metadata();
            Content content = new Content(file, file, raw, "trec/plain", meta, conf);

            //Protocol protocol = new ProtocolFactory(conf).getProtocol(file);                                                                                                                                                             
            //Content content = protocol.getProtocolOutput(new Text(file), new CrawlDatum()).getContent();                                                                                                                                 
            //Parse parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);                                                                                                                                                  
View Full Code Here

        try {
            Configuration conf = NutchConfiguration.create();

            byte[] raw = getRawBytes(new File(file));
            Metadata meta = new Metadata();
            Content content = new Content(file, file, raw, "trec/plain", meta, conf);

            //Protocol protocol = new ProtocolFactory(conf).getProtocol(file);                                                                                                                                                             
            //Content content = protocol.getProtocolOutput(new Text(file), new CrawlDatum()).getContent();                                                                                                                                 
            //Parse parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);                                                                                                                                                  
View Full Code Here

    String file = args[0];
    byte[] raw = getRawBytes(new File(file));
       
    Metadata meta = new Metadata();
    meta.set(Response.CONTENT_LENGTH, "" + raw.length);
    Content content = new Content(file, file, raw, mime, meta,
                                  NutchConfiguration.create());

    System.out.println(parser.getParse(content).getText());
  }
View Full Code Here

   
    // Score at this stage is 1.0f.
    metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore()));

    final long startTime = System.currentTimeMillis();
    final Content content = new Content(url, url, contentBytes, mimetype,
      metaData, getConf());
    datum.setFetchTime(Nutchwax.getDate(arcData.getDate()));

    MapWritable mw = datum.getMetaData();
   
View Full Code Here

    // first disable auto detection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);

    Metadata metadata = new Metadata();
    EncodingDetector detector;
    Content content;
    String encoding;

    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    encoding = detector.guessEncoding(content, "windows-1252");
    // no information is available, so it should return default encoding
    Assert.assertEquals("windows-1252", encoding.toLowerCase());

    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    encoding = detector.guessEncoding(content, "windows-1252");
    Assert.assertEquals("utf-16", encoding.toLowerCase());

    metadata.clear();
    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    detector.addClue("windows-1254", "sniffed");
    encoding = detector.guessEncoding(content, "windows-1252");
    Assert.assertEquals("windows-1254", encoding.toLowerCase());

    // enable autodetection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    detector.addClue("utf-32", "sniffed");
    encoding = detector.guessEncoding(content, "windows-1252");
View Full Code Here

TOP

Related Classes of org.apache.nutch.protocol.Content

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.