Examples of DocumentMetadata


Examples of lucandra.serializers.thrift.DocumentMetadata

                return; // this docId is missing
            }

            DocumentMetadata allTerms = IndexWriter.fromBytesUsingThrift(rows.get(0).cf.getColumn(
                    CassandraUtils.documentMetaFieldBytes).value());

            List<ReadCommand> readCommands = new ArrayList<ReadCommand>();

            for (ThriftTerm t : allTerms.getTerms())
            {
                // skip terms that do not belong to this field
                if (!t.getField().equals(field))
                    continue;
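
The per-field filter above is a recurring pattern when reading a DocumentMetadata back out of the __META__ column. A minimal helper sketch (the method name and signature are illustrative, not part of Lucandra's API):

    static List<ThriftTerm> termsForField(DocumentMetadata meta, String field)
    {
        List<ThriftTerm> matches = new ArrayList<ThriftTerm>();
        for (ThriftTerm t : meta.getTerms())
        {
            // same per-field check as in the loop above
            if (t.getField().equals(field))
                matches.add(t);
        }
        return matches;
    }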

Examples of lucandra.serializers.thrift.DocumentMetadata

                        {
                            logger.warn("Filtering out __META__ key");
                            continue;
                        }

                        DocumentMetadata dm = lucandra.IndexWriter.fromBytesUsingThrift(col.value());

                        for (ThriftTerm term : dm.getTerms())
                        {
                            Fieldable f = null;

                            if (term.isSetLongVal())
                            {
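
The excerpt breaks off at the first isSet check. A hedged sketch of how the typed-value dispatch plausibly continues, using the Thrift-generated isSet/get accessors implied by the setters shown elsewhere on this page and Lucene 3.x's NumericField API (the stored-field flags are assumptions; the original flags are not visible in the excerpt):

    if (term.isSetLongVal())
        f = new NumericField(term.getField()).setLongValue(term.getLongVal());
    else if (term.isSetIntVal())
        f = new NumericField(term.getField()).setIntValue(term.getIntVal());
    else if (term.isSetFloatVal())
        f = new NumericField(term.getField()).setFloatValue(term.getFloatVal());
    else if (term.isSetDoubleVal())
        f = new NumericField(term.getField()).setDoubleValue(term.getDoubleVal());
    else
        // plain text payload; getText() is assumed to return the raw bytes
        f = new Field(term.getField(), new String(term.getText(), "UTF-8"),
                Field.Store.YES, Field.Index.NO);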

Examples of lucandra.serializers.thrift.DocumentMetadata

        byte[] indexNameBytes = indexName.getBytes("UTF-8");
        ByteBuffer indexTermsKey = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes,
                "terms".getBytes("UTF-8"));

        DocumentMetadata allIndexedTerms = new DocumentMetadata();
        Map<String, DocumentMetadata> fieldCache = new HashMap<String, DocumentMetadata>(1024);

        // By default index sharding is not handled here: the doc number
        // simply wraps within one shard, replacing older docs round-robin.
        docNumber = docNumber % CassandraIndexManager.maxDocsPerShard;

        ByteBuffer docId = ByteBuffer.wrap(CassandraUtils.writeVInt(docNumber));
        int position = 0;

        for (Fieldable field : doc.getFields())
        {

            ThriftTerm firstTerm = null;

            // Indexed field
            if (field.isIndexed() && field.isTokenized())
            {
                TokenStream tokens = field.tokenStreamValue();

                if (tokens == null)
                {
                    Reader tokReader = field.readerValue();

                    if (tokReader == null)
                        tokReader = new StringReader(field.stringValue());

                    tokens = analyzer.reusableTokenStream(field.name(), tokReader);
                }

                // collect term information per field
                Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new HashMap<Term, Map<ByteBuffer, List<Number>>>();

                int lastOffset = 0;
                if (position > 0)
                {
                    position += analyzer.getPositionIncrementGap(field.name());
                }

                // Build the termPositions vector for all terms

                tokens.reset(); // reset the TokenStream to the first token

                // set up token attributes we are working on

                // offsets
                OffsetAttribute offsetAttribute = null;
                if (field.isStoreOffsetWithTermVector())
                    offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

                // positions (always gathered in later Lucene versions)
                PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) tokens
                        .addAttribute(PositionIncrementAttribute.class);

                // term as string
                CharTermAttribute termAttribute = (CharTermAttribute) tokens.addAttribute(CharTermAttribute.class);

                // Store field norms per term per document rather than per
                // field: more data to write, but less to read on the query side.
                int tokensInField = 0;

                while (tokens.incrementToken())
                {
                    tokensInField++;
                    Term term = new Term(field.name(), termAttribute.toString());

                    ThriftTerm tterm = new ThriftTerm(term.field()).setText(
                            ByteBuffer.wrap(term.text().getBytes("UTF-8"))).setIs_binary(false);

                    if (firstTerm == null)
                        firstTerm = tterm;

                    allIndexedTerms.addToTerms(tterm);

                    // fetch all collected information for this term
                    Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);

                    if (termInfo == null)
                    {
                        termInfo = new HashMap<ByteBuffer, List<Number>>();
                        allTermInformation.put(term, termInfo);
                    }

                    // term frequency
                    {
                        List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKeyBytes);

                        if (termFrequency == null)
                        {
                            termFrequency = new ArrayList<Number>();
                            termFrequency.add(0);
                            termInfo.put(CassandraUtils.termFrequencyKeyBytes, termFrequency);
                        }

                        // increment
                        termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                    }

                    // position vector
                    {
                        position += (posIncrAttribute.getPositionIncrement() - 1);

                        List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKeyBytes);

                        if (positionVector == null)
                        {
                            positionVector = new ArrayList<Number>();
                            termInfo.put(CassandraUtils.positionVectorKeyBytes, positionVector);
                        }

                        positionVector.add(++position);
                    }

                    // term offsets
                    if (field.isStoreOffsetWithTermVector())
                    {

                        List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKeyBytes);
                        if (offsetVector == null)
                        {
                            offsetVector = new ArrayList<Number>();
                            termInfo.put(CassandraUtils.offsetVectorKeyBytes, offsetVector);
                        }

                        offsetVector.add(lastOffset + offsetAttribute.startOffset());
                        offsetVector.add(lastOffset + offsetAttribute.endOffset());

                    }
                }

                List<Number> bnorm = null;
                if (!field.getOmitNorms())
                {
                    bnorm = new ArrayList<Number>();

                    final FieldInvertState invertState = new FieldInvertState();
                    invertState.setBoost(doc.getBoost() * field.getBoost());
                    invertState.setLength(tokensInField);
                    final float norm = similarity.computeNorm(field.name(), invertState);

                    bnorm.add(Similarity.getDefault().encodeNormValue(norm));
                }

                for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet())
                {

                    // Each term is stored under its own unique row key; this
                    // is required since Cassandra loads all columns of a row
                    // in a column family into memory
                    ByteBuffer key = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes, term
                            .getKey().field().getBytes("UTF-8"), CassandraUtils.delimeterBytes, term.getKey().text()
                            .getBytes("UTF-8"));

                    // Mix the norm for this field in alongside each term:
                    // more writes, but faster on the read side.
                    if (!field.getOmitNorms())
                    {
                        term.getValue().put(CassandraUtils.normsKeyBytes, bnorm);
                    }

                    CassandraUtils.addMutations(workingMutations, CassandraUtils.termVecColumnFamily, docId, key,
                            new LucandraTermInfo(docNumber, term.getValue()).serialize());

                    // Store all terms under a row
                    CassandraUtils.addMutations(workingMutations, CassandraUtils.metaInfoColumnFamily,
                            CassandraUtils.createColumnName(term.getKey()), indexTermsKey,
                            ByteBufferUtil.EMPTY_BYTE_BUFFER);
                }
            }

            // Untokenized fields go in without a termPosition
            if (field.isIndexed() && !field.isTokenized())
            {
                ThriftTerm tterm = new ThriftTerm(field.name()).setText(
                        ByteBuffer.wrap(field.stringValue().getBytes("UTF-8"))).setIs_binary(false);

                if (firstTerm == null)
                    firstTerm = tterm;

                allIndexedTerms.addToTerms(tterm);

                ByteBuffer key = CassandraUtils.hashKeyBytes(indexName.getBytes("UTF-8"),
                        CassandraUtils.delimeterBytes, field.name().getBytes("UTF-8"), CassandraUtils.delimeterBytes,
                        field.stringValue().getBytes("UTF-8"));

                CassandraUtils.addMutations(workingMutations, CassandraUtils.termVecColumnFamily, docId, key,
                        new LucandraTermInfo(docNumber, emptyTermMap).serialize());

                // Store all terms under a row
                CassandraUtils.addMutations(workingMutations, CassandraUtils.metaInfoColumnFamily,
                        CassandraUtils.createColumnName(field), indexTermsKey, ByteBufferUtil.EMPTY_BYTE_BUFFER);
            }

            // Stores each field as a column under this doc key
            if (field.isStored())
            {
                ThriftTerm tt = new ThriftTerm(field.name());

                if (field instanceof NumericField)
                {
                    Number n = ((NumericField) field).getNumericValue();
                    switch (((NumericField) field).getDataType())
                    {
                    case LONG:   tt.setLongVal(n.longValue());     break;
                    case INT:    tt.setIntVal(n.intValue());       break;
                    case FLOAT:  tt.setFloatVal(n.floatValue());   break;
                    case DOUBLE: tt.setDoubleVal(n.doubleValue()); break;
                    default: throw new IllegalStateException("Unknown numeric type in field: " + field);
                    }
                }

                byte[] value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");
                tt.setText(ByteBuffer.wrap(value)).setIs_binary(field.isBinary());

                // handle multiple fields with the same name
                DocumentMetadata currentValue = fieldCache.get(field.name());
                if (currentValue == null)
                {
                    currentValue = new DocumentMetadata();
                    fieldCache.put(field.name(), currentValue);
                }

                currentValue.addToTerms(tt);
            }

            // Store for field cache
            if (firstTerm != null)
            {
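
The excerpt is cut off where firstTerm is saved for the field cache. Stripped of the surrounding indexing machinery, the core DocumentMetadata usage is small; a minimal sketch using only calls that appear above (the field name and value are made up):

    DocumentMetadata meta = new DocumentMetadata();

    ThriftTerm term = new ThriftTerm("title")
            .setText(ByteBuffer.wrap("hello world".getBytes("UTF-8")))
            .setIs_binary(false);

    meta.addToTerms(term);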

Examples of lucandra.serializers.thrift.DocumentMetadata

        IColumn metaCol = rows.get(0).cf.getColumn(CassandraUtils.documentMetaFieldBytes);
        if (metaCol == null)
            return;

        DocumentMetadata terms = fromBytesUsingThrift(metaCol.value());

        Set<String> fields = new HashSet<String>();

        for (ThriftTerm term : terms.getTerms())
        {
            // remove from field cache
            if (!fields.contains(term.getField()))
            {
                ByteBuffer fieldCacheKey = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes,
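
The last statement is truncated mid-call. Judging by how indexTermsKey is hashed in the indexing example above, the field-cache key is plausibly completed as follows (a hedged reconstruction, not the verified original):

    ByteBuffer fieldCacheKey = CassandraUtils.hashKeyBytes(indexNameBytes,
            CassandraUtils.delimeterBytes, term.getField().getBytes("UTF-8"));

    fields.add(term.getField());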

Examples of lucandra.serializers.thrift.DocumentMetadata

    }

    /** Read the object back from a compressed byte buffer. */
    public static DocumentMetadata fromBytesUsingThrift(ByteBuffer data) throws IOException
    {
        DocumentMetadata docMeta = new DocumentMetadata();

        byte[] decompressedData = CassandraUtils.decompress(ByteBufferUtil.getArray(data));

        TTransport trans = new TMemoryInputTransport(decompressedData);
        TProtocol deser = protocolFactory.getProtocol(trans);

        try
        {
            docMeta.read(deser);
        }
        catch (TException e)
        {
            throw new IOException(e);
        }
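
The excerpt ends before the method returns docMeta. For a round trip, this deserializer is paired with a serializer in the opposite direction; a hedged sketch of what that looks like (TMemoryBuffer is the standard Thrift write transport; the CassandraUtils.compress call mirrors the decompress step above and is an assumption):

    public static ByteBuffer toBytesUsingThrift(DocumentMetadata docMeta) throws IOException
    {
        TMemoryBuffer trans = new TMemoryBuffer(1024);
        TProtocol ser = protocolFactory.getProtocol(trans);

        try
        {
            docMeta.write(ser);
        }
        catch (TException e)
        {
            throw new IOException(e);
        }

        // copy only the bytes actually written into the transport
        byte[] raw = new byte[trans.length()];
        System.arraycopy(trans.getArray(), 0, raw, 0, trans.length());

        return ByteBuffer.wrap(CassandraUtils.compress(raw));
    }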

Examples of org.apache.ctakes.preprocessor.DocumentMetaData

     */
    public void testProcess()
    {
        try
        {
            DocumentMetaData dmd = iv_cnotePreProcessor.process(iv_cnoteXML);

            // validate document properties
            String docID = "000000000";
            String serviceCode = "MNT";
            Map docProperties = dmd.getMetaData();
            String cnote_docID =
                (String) docProperties.get(
                    ClinicalNotePreProcessor.MD_KEY_DOC_ID);
            String cnote_serviceCode =
                (String) docProperties.get(
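
The test is truncated in the middle of the second property lookup (the elided map key is presumably the service-code counterpart of MD_KEY_DOC_ID). The assertions that follow plausibly compare the extracted values against the expected constants; a hedged completion:

            assertEquals(docID, cnote_docID);
            assertEquals(serviceCode, cnote_serviceCode);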

Examples of org.apache.ctakes.preprocessor.DocumentMetaData

  public void process(JCas jcas) throws AnalysisEngineProcessException {

      logger.info(" process(JCas)");

      String originalText = null;
      DocumentMetaData dmd;

      try {

          JCas originalView = jcas.getView("_InitialView");
          originalText = originalView.getSofaDataString();

          PreProcessor pp = new ClinicalNotePreProcessor(
                  dtdFile,
                  includeSectionMarkers.booleanValue());
          dmd = pp.process(originalText);

          String text = dmd.getText();
          StringBuffer sb = new StringBuffer(text);

          applyTextModifier(text, sb);

          // Create a view (and its Sofa) to hold the plain text version of
          // the CDA document
          JCas plaintextView = jcas.createView("plaintext");
          plaintextView.setDocumentText(sb.toString());

          // Add section (segment) annotations
          Iterator<String> segmentItr = (Iterator<String>) dmd.getSegmentIdentifiers().iterator();
          while (segmentItr.hasNext())
          {
              String segmentID = segmentItr.next();
              SegmentMetaData smd = dmd.getSegment(segmentID);

              Segment sa = new Segment(plaintextView);
              sa.setBegin(smd.span.start);
              sa.setEnd(smd.span.end);
              sa.setId(smd.id);

              sa.addToIndexes();
          }

          // Store meta data about the document
          Pairs propAnnot = new Pairs(plaintextView);
          Map metaDataMap = dmd.getMetaData();

          String docID = (String) metaDataMap.get(ClinicalNotePreProcessor.MD_KEY_DOC_ID);
          if (docID != null) {
              DocumentID newDocId = new DocumentID(plaintextView);
              newDocId.setDocumentID(docID);
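
The excerpt stops right after the DocumentID is populated. In UIMA, a feature structure only becomes visible to downstream annotators once it is registered with the CAS indexes, so the next step is plausibly (a hedged completion):

              newDocId.addToIndexes();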

Examples of org.apache.ctakes.preprocessor.DocumentMetaData

            }
           
            PreProcessor pp = new ClinicalNotePreProcessor(dtdFile, false);

            timestamp = System.currentTimeMillis();
            DocumentMetaData dmd = pp.process(hl7Text);
            elapsedTime = System.currentTimeMillis() - timestamp;
            System.out.println("PreProcessor Took " + elapsedTime + "ms");

            System.out.println("Plain Text Start");
            System.out.println(dmd.getText());
            System.out.println("Plain Text End");

            sectionNames = dmd.getSegmentIdentifiers();
            snItr = sectionNames.iterator();
            while (snItr.hasNext())
            {
                String sectionId = (String) snItr.next();
                SegmentMetaData smd = dmd.getSegment(sectionId);
                System.out.println("SECTION="
                        + sectionId
                        + "\tSTART_OFFSET="
                        + smd.span.start
                        + "\tEND_OFFSET="
                        + smd.span.end);
                //System.out.println(dmd.getText().substring(smd.span.start, smd.span.end));
            }

            Map metaDataMap = dmd.getMetaData();
            Iterator keyItr = metaDataMap.keySet().iterator();
            while (keyItr.hasNext())
            {
                Object key = keyItr.next();
                Object value = metaDataMap.get(key);
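
The metadata dump loop is truncated after fetching each value; a hedged completion that simply prints the pair in the same style as the section output above:

                System.out.println("KEY=" + key + "\tVALUE=" + value);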

Examples of org.exist.dom.DocumentMetadata

                    w.flush();
                    w.close();
                   
                    is = new FileInputStream(tempFile);
                   
                    final DocumentMetadata meta = doc.getMetadata();
                   
                    final Date created = new Date(meta.getCreated());
                    final Date lastModified = new Date(meta.getLastModified());
   
                    BinaryDocument binary = destination.validateBinaryResource(txn, broker, newName, is, mimeType.getName(), -1, created, lastModified);
                   
                    binary = destination.addBinaryResource(txn, broker, binary, is, mimeType.getName(), -1, created, lastModified);
                   
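
eXist's DocumentMetadata is a plain bean, so reading metadata back is symmetric with the setters shown in the next example (getMimeType is assumed from the setMimeType call below; the other two getters appear above):

    final DocumentMetadata meta = doc.getMetadata();

    String mimeType = meta.getMimeType();  // assumed getter, mirrors setMimeType
    long created = meta.getCreated();
    long lastModified = meta.getLastModified();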

Examples of org.exist.dom.DocumentMetadata

          // store as XML resource
          final IndexInfo info = currentCollection.validateXMLResource(txn, broker, docUri, is);

          resource = info.getDocument();
          final DocumentMetadata meta = resource.getMetadata();
          meta.setMimeType(mimetype);
          meta.setCreated(date_created.getTime());
          meta.setLastModified(date_modified.getTime());

          if ((publicid != null) || (systemid != null)) {
              final DocumentType docType = new DocumentTypeImpl(namedoctype, publicid, systemid);
              meta.setDocType(docType);
          }

          rh.startDocumentRestore(resource, atts);

          currentCollection.store(txn, broker, info, is, false);