Package org.apache.nutch.metadata

Examples of org.apache.nutch.metadata.Metadata
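
Before the excerpts, a minimal sketch of the Metadata calls they rely on. This is an illustrative example written for this page, not taken from Nutch itself; the "keywords" name is just a made-up key.

import org.apache.nutch.metadata.Metadata;

public class MetadataSketch {
  public static void main(String[] args) {
    Metadata meta = new Metadata();

    // set() replaces any existing values stored under a name
    meta.set(Metadata.CONTENT_TYPE, "application/pdf");

    // add() appends, so a name can hold several values
    meta.add("keywords", "nutch");
    meta.add("keywords", "metadata");

    // get() returns the first value, getValues() returns all of them
    System.out.println(meta.get(Metadata.CONTENT_TYPE));    // application/pdf
    System.out.println(meta.getValues("keywords").length);  // 2

    // names() lists every name currently stored
    for (String name : meta.names()) {
      System.out.println(name + " = " + meta.get(name));
    }
  }
}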


      String collectionName, String filename, URLNormalizers urlNormalizers, URLFilters filters) throws ParseException, IOException {
    Token docNumber = null;
    TRECDoc d = new TRECDoc();
    StringBuffer docBody = new StringBuffer();
    String mimetype = null;
    Metadata metaData = new Metadata();
    Date fetchDate = null;
   
    label_1:
      while (true) {
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case DOC_BEGIN:
          ;
          break;
        default:
          jj_la1[0] = jj_gen;
        break label_1;
        }
        try {
          try {
          jj_consume_token(DOC_BEGIN);
          label_2:
            while (true) {
              switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
              case 23:
              case 24:
              case 25:
              case 26:
                ;
                break;
              default:
                jj_la1[1] = jj_gen;
              break label_2;
              }
              switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
              case 23:
                jj_consume_token(23);
                break;
              case 24:
                jj_consume_token(24);
                break;
              case 25:
                jj_consume_token(25);
                break;
              case 26:
                jj_consume_token(26);
                break;
              default:
                jj_la1[2] = jj_gen;
              jj_consume_token(-1);
              throw new ParseException();
              }
            }
          label_3:
            while (true) {
              switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
              case DOCNO_BEGIN:
                jj_consume_token(DOCNO_BEGIN);
                docNumber = jj_consume_token(DOCNO);
                jj_consume_token(DOCNO_END);
                label_4:
                  while (true) {
                    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
                    case 23:
                    case 24:
                    case 25:
                    case 26:
                      ;
                      break;
                    default:
                      jj_la1[3] = jj_gen;
                    break label_4;
                    }
                    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
                    case 23:
                      jj_consume_token(23);
                      break;
                    case 24:
                      jj_consume_token(24);
                      break;
                    case 25:
                      jj_consume_token(25);
                      break;
                    case 26:
                      jj_consume_token(26);
                      break;
                    default:
                      jj_la1[4] = jj_gen;
                    jj_consume_token(-1);
                    throw new ParseException();
                    }
                  }
                break;
              case OLDDOCNO_BEGIN:
                jj_consume_token(OLDDOCNO_BEGIN);
                jj_consume_token(DOCNO);
                jj_consume_token(OLDDOCNO_END);
                label_5:
                  while (true) {
                    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
                    case 23:
                    case 24:
                    case 25:
                    case 26:
                      ;
                      break;
                    default:
                      jj_la1[5] = jj_gen;
                    break label_5;
                    }
                    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
                    case 23:
                      jj_consume_token(23);
                      break;
                    case 24:
                      jj_consume_token(24);
                      break;
                    case 25:
                      jj_consume_token(25);
                      break;
                    case 26:
                      jj_consume_token(26);
                      break;
                    default:
                      jj_la1[6] = jj_gen;
                    jj_consume_token(-1);
                    throw new ParseException();
                    }
                  }
                break;
              default:
                jj_la1[7] = jj_gen;
              jj_consume_token(-1);
              throw new ParseException();
              }
              switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
              case DOCNO_BEGIN:
              case OLDDOCNO_BEGIN:
                ;
                break;
              default:
                jj_la1[8] = jj_gen;
              break label_3;
              }
            }
            jj_consume_token(DOCHDR_BEGIN);
            docHdrBody(d);
            jj_consume_token(DOCHDR_END);
            docBody = new StringBuffer(body());
            switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
            case DOC_END:
              jj_consume_token(DOC_END);
              break;
            case 0:
              jj_consume_token(0);
              break;
            default:
              jj_la1[9] = jj_gen;
            jj_consume_token(-1);
            throw new ParseException();
            }
            label_6:
              while (true) {
                switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
                case 23:
                case 24:
                case 25:
                case 26:
                  ;
                  break;
                default:
                  jj_la1[10] = jj_gen;
                break label_6;
                }
                switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
                case 23:
                  jj_consume_token(23);
                  break;
                case 24:
                  jj_consume_token(24);
                  break;
                case 25:
                  jj_consume_token(25);
                  break;
                case 26:
                  jj_consume_token(26);
                  break;
                default:
                  jj_la1[11] = jj_gen;
                jj_consume_token(-1);
                throw new ParseException();
                }
              }
          }
          catch (Error err) { // TODO MC - to catch
           LOG.error("Error:"+err.getMessage());
           throw new ParseException(err.getMessage());
          }     
        }
        catch (Exception e) {
          LOG.error("Parsing exception:"+e.toString());
          StringBuilder buffer = new StringBuilder(16000); // TODO MC
          Token t = null;
         
          do {
            try {
              t = getNextToken();
              buffer.append(t.image);
            }                       
            catch (Error err) {
              LOG.error("Parsing nested error:"+err.toString());
            }
            catch (Exception enest) {
              LOG.error("Parsing nested exception:"+enest.toString());
            }
          }
          while (t != null && t.kind != DOC_END && t.kind != EOF);
          docBody.append(buffer.toString()); // TODO MC
          if (t != null && t.kind != EOF) {
            // Eat up DOC_END.
            try {
              t = getNextToken();
            }             
            catch (Error err) {
              LOG.error("Parsing nested error2:"+err.toString());
            }
            catch (Exception enest) {
              LOG.error("Parsing nested exception2:"+enest.toString());
            }
          }         
         
          // If error, skip this doc. completely.
          //{if (true) return;} TODO MC - this line ignores all other files from gz files
          LOG.error("Parsing will continue with text:"); // TODO MC
          LOG.error(""+(new String(docBody.toString().getBytes()))); // TODO MC
        }       
        d.docNumber = docNumber.image;
        d.bodyLength = new Integer(docBody.length());
             
     
        /* TODO MC - normalize and filter URL */
        String url = d.urlString;
        try {
          url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_FETCHER);
          url = filters.filter(url); // filter the url
        }
        catch (Exception e) {
          LOG.error("Error:" + e.getMessage());
          throw new ParseException(e.getMessage());
        }
         
        LOG.info("Importing DocNo:" + d.docNumber + " url:" + url + " oldurl:" +d.urlString);
          if (url!=null) { // for instance if the url is too large then it is null                                       
        
            //Go through headers
            for (Iterator it=d.headers.entrySet().iterator(); it.hasNext(); ) {
              Map.Entry me = (Map.Entry)it.next();
              String key = (String)me.getKey();
              String value = (String)me.getValue();
              // Find a mimetype
              if (key.toLowerCase().equals(CONTENT_TYPE_KEY)) {
                // Is it a valid mime type?
                try {
                  mimetype = value.toLowerCase().replaceAll(WHITESPACE, "-");
                  if (mimetype == null) {
                    mimetype = "no-type";
                  }
                  // Throws MimeTypeException if the declared type is not valid
                  new MimeType(value.toLowerCase());
                } catch (MimeTypeException e) {
                  mimetype = "no-type";
                }
                if (skip(mimetype)) { //XXX intentionally empty: skipped mime types are not acted on here
                }
              }
              // Parse a fetch date from the http headers
              if (key.toLowerCase().equals(DATE_KEY)) {
                try {
                  fetchDate = parseDate(value);
                } catch (ParseException e) {
                  // Need to log this
                  // Parse exception, default date will be inserted later
                  LOG.error("Date Exception " + e.getMessage());
                }
              }
              // Add the rest of headers to the metadata
              metaData.set(key, value);
            }
           
            // Set metadata document number
            metaData.set(DOCNO_KEY,d.docNumber);
            // Set mimetype
            metaData.set(CONTENT_TYPE_KEY, mimetype);
            //Set length
            metaData.set(CONTENT_LENGTH, d.bodyLength.toString());
            // Set Segment Name
            metaData.set(Nutch.SEGMENT_NAME_KEY, segmentName);
            //Set md5
            metaData.set(Nutch.SIGNATURE_KEY, MD5Hash.digest(docBody.toString().getBytes()).toString());
            // Set collection
            metaData.set(COLLECTION,collectionName); // TODO MC
            // Set arcname
            metaData.set(ARC_NAME,filename); // TODO MC


            // If we didn't get a date, fall back to a fixed default timestamp.
            if (fetchDate == null) {
              fetchDate = new Date(Long.decode("1151693552").longValue());
            }             

            //start timer
            // long startTime = System.currentTimeMillis();

            //Make a content object
            Content content = new Content(url,url, docBody.toString().getBytes(), mimetype, metaData, conf);

            Parse parse = null;
            ParseStatus parseStatus;
            try {
              parse = pu.parse(content);
              parseStatus = parse.getData().getStatus();
            }
            catch (final Exception e) {
              parseStatus = new ParseStatus(e);
              LOG.error("error: unknown "+parseStatus.toString());
              if(!parseStatus.isSuccess()) {
                LOG.error("parse failure");
              }
            }
            catch (StackOverflowError soe){
              parseStatus = new ParseStatus(soe);
              LOG.error("error: StackOverflowError "+parseStatus.toString());
              if(!parseStatus.isSuccess()) {
                LOG.error("parse failure");
              }
            }

            if(parseStatus.isSuccess()) {
              CrawlDatum datum = new CrawlDatum();
              datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
              datum.setFetchTime(fetchDate.getTime());
         
              // Score at this stage is 1.0f.
              metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore())); // TODO MC
                               
              // WritableComparable outkey = new UTF8(d.urlString);
              WritableComparable outkey = new Text(url);
              Writable outvalue = new FetcherOutput(datum, null, new ParseImpl(parse));                
                   
              // output.collect(outkey, outvalue);
              Text key=Nutchwax.generateWaxKey(outkey, collectionName);
              output.collect(key, outvalue);                       
            }
          }
         
        d = new TRECDoc();
        metaData = new Metadata();
      }

    jj_consume_token(0);
  }


    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;
    Metadata metadata = new Metadata();

    try {

      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
          && raw.length != Integer.parseInt(contentLength)) {
        return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
            "Content truncated at " + raw.length
            + " bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }

      // TODO MC - store pdf files to analyze
      // FileOutputStream fout = new FileOutputStream("/home/nutchwax/lixo/"+System.currentTimeMillis()+".pdf");
      // fout.write(raw);
      // fout.close();
      // TODO MC

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // More potentially useful info; currently unused, kept for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
     
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
     
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));

    try {
      Configuration conf = NutchConfiguration.create();

      byte[] raw = getRawBytes(new File(file));
      Metadata meta = new Metadata();
      Content content = new Content(file, file, raw, "application/pdf", meta, conf);

      //Protocol protocol = new ProtocolFactory(conf).getProtocol(file);
      //Content content = protocol.getProtocolOutput(new Text(file), new CrawlDatum()).getContent();
      //Parse parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);

  // When this constructor is called, conf is null. Whoever uses this object must not forget to set the conf.
  public ParseData() {}

  public ParseData(ParseStatus status, String title, Outlink[] outlinks,
                   Metadata contentMeta) {
    this(status, title, outlinks, contentMeta, new Metadata());
  }
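
A hedged sketch of how a caller might honor the warning in the comment above: build a ParseData and set its Configuration explicitly before use. The status, title, and metadata values are made up for illustration, and setConf() is assumed to be available through the Configurable interface this class implements in Nutch 1.x.

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.util.NutchConfiguration;

public class ParseDataConfSketch {
  public static ParseData build() {
    Configuration conf = NutchConfiguration.create();
    Metadata contentMeta = new Metadata();
    contentMeta.set(Metadata.CONTENT_TYPE, "text/html");

    // Illustrative values only (assumption: ParseStatus.STATUS_SUCCESS and an empty outlink array)
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
        "Example title", new Outlink[0], contentMeta);

    // Heed the warning above: set the conf explicitly, otherwise it stays null
    parseData.setConf(conf);
    return parseData;
  }
}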

      Outlink.skip(in);
    }
   
    if (version < 3) {
      int propertyCount = in.readInt();             // read metadata
      contentMeta = new Metadata();
      for (int i = 0; i < propertyCount; i++) {
        contentMeta.add(Text.readString(in), Text.readString(in));
      }
    } else {
      contentMeta = new Metadata();
      contentMeta.readFields(in);
    }
    if (version > 3) {
      parseMeta = new Metadata();
      parseMeta.readFields(in);
    }
  }
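
Since Metadata implements Hadoop's Writable, the readFields() call above has a matching write(). Below is a minimal round-trip sketch (illustrative, not part of the excerpt) using Hadoop's in-memory buffers:

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.nutch.metadata.Metadata;

public class MetadataRoundTrip {
  public static void main(String[] args) throws Exception {
    Metadata out = new Metadata();
    out.add("Content-Type", "text/html");

    DataOutputBuffer ob = new DataOutputBuffer();
    out.write(ob);                        // serialize the metadata

    DataInputBuffer ib = new DataInputBuffer();
    ib.reset(ob.getData(), ob.getLength());

    Metadata in = new Metadata();
    in.readFields(ib);                    // read it back, as ParseData.readFields does above
    System.out.println(in.get("Content-Type"));
  }
}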

    inflated = true;
  }

  protected final void readFieldsCompressed(DataInput in) throws IOException {
    version = in.readByte();
    metadata = new Metadata();
    switch (version) {
    case 0:
    case 1:
      url = UTF8.readString(in); // read url 
      base = UTF8.readString(in); // read base

            return;                                     // only have inlinks
        }
    }
      
    Document doc = new Document();
    Metadata metadata = parseData.getContentMeta();
  
    if (metadata.get(Nutch.SEGMENT_NAME_KEY)==null || metadata.get(Nutch.SIGNATURE_KEY)==null) {
      LOG.error("Metadata empty:"+key+" "+parseData.toString());
      return;
    }
   
    // add segment, used to map from merged index back to segment files
    doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY),
            Field.Store.YES, Field.Index.NO));

    // add digest, used by dedup
    doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY),
            Field.Store.YES, Field.Index.NO));
        
    Parse parse = new ParseImpl(parseText, parseData);
    try {
      // run indexing filters

                      Integer.parseInt(request.getParameter("id")));

    HitDetails details = bean.getDetails(hit);
    String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo();

    Metadata metaData = bean.getParseData(details).getContentMeta();

    String content = null;
    String contentType = (String) metaData.get(Metadata.CONTENT_TYPE);


    if (contentType.startsWith("text/html")) {
      // FIXME : it's better to emit the original 'byte' sequence
      // with 'charset' set to the value of 'CharEncoding',
      // but I don't know how to emit 'byte sequence' in JSP.
      // out.getOutputStream().write(bean.getContent(details)) may work,
      // but I'm not sure.
      String encoding = (String) metaData.get("CharEncodingForConversion");
      if (encoding != null) {
        try {
          content = new String(bean.getContent(details), encoding);
        } catch (UnsupportedEncodingException e) {
          //fallback to configured charset

    // raw bytes
    byte[] bytes = getServiceLocator().getNutchBean().getContent(details);

    // pass all original headers? only these for now.
    Metadata metadata = getServiceLocator().getNutchBean()
        .getParseData(details).getContentMeta();
    String contentType = metadata.get(Response.CONTENT_TYPE);
    // String lastModified = metadata.get(Metadata.LAST_MODIFIED);
    // String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
    // better use this, since it may have been truncated during fetch
    // or give warning if they don't match?
    int contentLength = bytes.length;

     
      try
      {
        in.readFully(bytes);
        Parse parse = parser.getParse(new Content(url, url, bytes,
          "application/pdf", new Metadata(), conf));
        System.out.println(parse.getData().getTitle());
      }
      finally
      {
        if (in != null)
