Examples of DocumentMetadata


Examples of lucandra.serializers.thrift.DocumentMetadata

                return; // this docId is missing
            }

            DocumentMetadata allTerms = IndexWriter.fromBytesUsingThrift(rows.get(0).cf.getColumn(
                    CassandraUtils.documentMetaFieldBytes).value());

            List<ReadCommand> readCommands = new ArrayList<ReadCommand>();

            for (ThriftTerm t : allTerms.getTerms())
            {
                // skip terms that do not belong to this field
                if (!t.getField().equals(field))
                    continue;
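
The per-field filter above is a recurring pattern when reading a DocumentMetadata back out of the __META__ column. A minimal helper sketch (the method name and signature are illustrative, not part of Lucandra's API):

    static List<ThriftTerm> termsForField(DocumentMetadata meta, String field)
    {
        List<ThriftTerm> matches = new ArrayList<ThriftTerm>();
        for (ThriftTerm t : meta.getTerms())
        {
            // same per-field check as in the loop above
            if (t.getField().equals(field))
                matches.add(t);
        }
        return matches;
    }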

Examples of lucandra.serializers.thrift.DocumentMetadata

                        {
                            logger.warn("Filtering out __META__ key");
                            continue;
                        }

                        DocumentMetadata dm = lucandra.IndexWriter.fromBytesUsingThrift(col.value());

                        for (ThriftTerm term : dm.getTerms())
                        {
                            Fieldable f = null;

                            if (term.isSetLongVal())
                            {
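
The excerpt breaks off at the first isSet check. A hedged sketch of how the typed-value dispatch plausibly continues, using the Thrift-generated isSet/get accessors implied by the setters shown elsewhere on this page and Lucene 3.x's NumericField API (the stored-field flags are assumptions; the original flags are not visible in the excerpt):

    if (term.isSetLongVal())
        f = new NumericField(term.getField()).setLongValue(term.getLongVal());
    else if (term.isSetIntVal())
        f = new NumericField(term.getField()).setIntValue(term.getIntVal());
    else if (term.isSetFloatVal())
        f = new NumericField(term.getField()).setFloatValue(term.getFloatVal());
    else if (term.isSetDoubleVal())
        f = new NumericField(term.getField()).setDoubleValue(term.getDoubleVal());
    else
        // plain text payload; getText() is assumed to return the raw bytes
        f = new Field(term.getField(), new String(term.getText(), "UTF-8"),
                Field.Store.YES, Field.Index.NO);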

Examples of lucandra.serializers.thrift.DocumentMetadata

        byte[] indexNameBytes = indexName.getBytes("UTF-8");
        ByteBuffer indexTermsKey = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes,
                "terms".getBytes("UTF-8"));

        DocumentMetadata allIndexedTerms = new DocumentMetadata();
        Map<String, DocumentMetadata> fieldCache = new HashMap<String, DocumentMetadata>(1024);

        // By default index sharding is not handled here: the doc number
        // simply wraps within one shard, replacing older docs round-robin.
        docNumber = docNumber % CassandraIndexManager.maxDocsPerShard;

        ByteBuffer docId = ByteBuffer.wrap(CassandraUtils.writeVInt(docNumber));
        int position = 0;

        for (Fieldable field : doc.getFields())
        {

            ThriftTerm firstTerm = null;

            // Indexed field
            if (field.isIndexed() && field.isTokenized())
            {
                TokenStream tokens = field.tokenStreamValue();

                if (tokens == null)
                {
                    Reader tokReader = field.readerValue();

                    if (tokReader == null)
                        tokReader = new StringReader(field.stringValue());

                    tokens = analyzer.reusableTokenStream(field.name(), tokReader);
                }

                // collect term information per field
                Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new HashMap<Term, Map<ByteBuffer, List<Number>>>();

                int lastOffset = 0;
                if (position > 0)
                {
                    position += analyzer.getPositionIncrementGap(field.name());
                }

                // Build the termPositions vector for all terms

                tokens.reset(); // reset the TokenStream to the first token

                // set up token attributes we are working on

                // offsets
                OffsetAttribute offsetAttribute = null;
                if (field.isStoreOffsetWithTermVector())
                    offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

                // positions (always gathered in later Lucene versions)
                PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) tokens
                        .addAttribute(PositionIncrementAttribute.class);

                // term as string
                CharTermAttribute termAttribute = (CharTermAttribute) tokens.addAttribute(CharTermAttribute.class);

                // Store field norms per term per document rather than per
                // field: more data to write, but less to read on the query side.
                int tokensInField = 0;

                while (tokens.incrementToken())
                {
                    tokensInField++;
                    Term term = new Term(field.name(), termAttribute.toString());

                    ThriftTerm tterm = new ThriftTerm(term.field()).setText(
                            ByteBuffer.wrap(term.text().getBytes("UTF-8"))).setIs_binary(false);

                    if (firstTerm == null)
                        firstTerm = tterm;

                    allIndexedTerms.addToTerms(tterm);

                    // fetch all collected information for this term
                    Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);

                    if (termInfo == null)
                    {
                        termInfo = new HashMap<ByteBuffer, List<Number>>();
                        allTermInformation.put(term, termInfo);
                    }

                    // term frequency
                    {
                        List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKeyBytes);

                        if (termFrequency == null)
                        {
                            termFrequency = new ArrayList<Number>();
                            termFrequency.add(0);
                            termInfo.put(CassandraUtils.termFrequencyKeyBytes, termFrequency);
                        }

                        // increment
                        termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                    }

                    // position vector
                    {
                        position += (posIncrAttribute.getPositionIncrement() - 1);

                        List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKeyBytes);

                        if (positionVector == null)
                        {
                            positionVector = new ArrayList<Number>();
                            termInfo.put(CassandraUtils.positionVectorKeyBytes, positionVector);
                        }

                        positionVector.add(++position);
                    }

                    // term offsets
                    if (field.isStoreOffsetWithTermVector())
                    {

                        List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKeyBytes);
                        if (offsetVector == null)
                        {
                            offsetVector = new ArrayList<Number>();
                            termInfo.put(CassandraUtils.offsetVectorKeyBytes, offsetVector);
                        }

                        offsetVector.add(lastOffset + offsetAttribute.startOffset());
                        offsetVector.add(lastOffset + offsetAttribute.endOffset());

                    }
                }

                List<Number> bnorm = null;
                if (!field.getOmitNorms())
                {
                    bnorm = new ArrayList<Number>();

                    final FieldInvertState invertState = new FieldInvertState();
                    invertState.setBoost(doc.getBoost() * field.getBoost());
                    invertState.setLength(tokensInField);
                    final float norm = similarity.computeNorm(field.name(), invertState);

                    bnorm.add(Similarity.getDefault().encodeNormValue(norm));
                }

                for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet())
                {

                    // Each term is stored under its own unique row key; this
                    // is required since Cassandra loads all columns of a row
                    // in a column family into memory
                    ByteBuffer key = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes, term
                            .getKey().field().getBytes("UTF-8"), CassandraUtils.delimeterBytes, term.getKey().text()
                            .getBytes("UTF-8"));

                    // Mix the norm for this field in alongside each term:
                    // more writes, but faster on the read side.
                    if (!field.getOmitNorms())
                    {
                        term.getValue().put(CassandraUtils.normsKeyBytes, bnorm);
                    }

                    CassandraUtils.addMutations(workingMutations, CassandraUtils.termVecColumnFamily, docId, key,
                            new LucandraTermInfo(docNumber, term.getValue()).serialize());

                    // Store all terms under a row
                    CassandraUtils.addMutations(workingMutations, CassandraUtils.metaInfoColumnFamily,
                            CassandraUtils.createColumnName(term.getKey()), indexTermsKey,
                            ByteBufferUtil.EMPTY_BYTE_BUFFER);
                }
            }

            // Untokenized fields go in without a termPosition
            if (field.isIndexed() && !field.isTokenized())
            {
                ThriftTerm tterm = new ThriftTerm(field.name()).setText(
                        ByteBuffer.wrap(field.stringValue().getBytes("UTF-8"))).setIs_binary(false);

                if (firstTerm == null)
                    firstTerm = tterm;

                allIndexedTerms.addToTerms(tterm);

                ByteBuffer key = CassandraUtils.hashKeyBytes(indexName.getBytes("UTF-8"),
                        CassandraUtils.delimeterBytes, field.name().getBytes("UTF-8"), CassandraUtils.delimeterBytes,
                        field.stringValue().getBytes("UTF-8"));

                CassandraUtils.addMutations(workingMutations, CassandraUtils.termVecColumnFamily, docId, key,
                        new LucandraTermInfo(docNumber, emptyTermMap).serialize());

                // Store all terms under a row
                CassandraUtils.addMutations(workingMutations, CassandraUtils.metaInfoColumnFamily,
                        CassandraUtils.createColumnName(field), indexTermsKey, ByteBufferUtil.EMPTY_BYTE_BUFFER);
            }

            // Stores each field as a column under this doc key
            if (field.isStored())
            {
                ThriftTerm tt = new ThriftTerm(field.name());

                if (field instanceof NumericField)
                {
                    Number n = ((NumericField) field).getNumericValue();
                    switch (((NumericField) field).getDataType())
                    {
                    case LONG:   tt.setLongVal(n.longValue());     break;
                    case INT:    tt.setIntVal(n.intValue());       break;
                    case FLOAT:  tt.setFloatVal(n.floatValue());   break;
                    case DOUBLE: tt.setDoubleVal(n.doubleValue()); break;
                    default: throw new IllegalStateException("Unknown numeric type in field: " + field);
                    }
                }

                byte[] value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");
                tt.setText(ByteBuffer.wrap(value)).setIs_binary(field.isBinary());

                // handle multiple fields with the same name
                DocumentMetadata currentValue = fieldCache.get(field.name());
                if (currentValue == null)
                {
                    currentValue = new DocumentMetadata();
                    fieldCache.put(field.name(), currentValue);
                }

                currentValue.addToTerms(tt);
            }

            // Store for field cache
            if (firstTerm != null)
            {
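
The excerpt is cut off where firstTerm is saved for the field cache. Stripped of the surrounding indexing machinery, the core DocumentMetadata usage is small; a minimal sketch using only calls that appear above (the field name and value are made up):

    DocumentMetadata meta = new DocumentMetadata();

    ThriftTerm term = new ThriftTerm("title")
            .setText(ByteBuffer.wrap("hello world".getBytes("UTF-8")))
            .setIs_binary(false);

    meta.addToTerms(term);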

Examples of lucandra.serializers.thrift.DocumentMetadata

        IColumn metaCol = rows.get(0).cf.getColumn(CassandraUtils.documentMetaFieldBytes);
        if (metaCol == null)
            return;

        DocumentMetadata terms = fromBytesUsingThrift(metaCol.value());

        Set<String> fields = new HashSet<String>();

        for (ThriftTerm term : terms.getTerms())
        {
            // remove from field cache
            if (!fields.contains(term.getField()))
            {
                ByteBuffer fieldCacheKey = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes,
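
The last statement is truncated mid-call. Judging by how indexTermsKey is hashed in the indexing example above, the field-cache key is plausibly completed as follows (a hedged reconstruction, not the verified original):

    ByteBuffer fieldCacheKey = CassandraUtils.hashKeyBytes(indexNameBytes,
            CassandraUtils.delimeterBytes, term.getField().getBytes("UTF-8"));

    fields.add(term.getField());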

Examples of lucandra.serializers.thrift.DocumentMetadata

    }

    /** Read the object back from a compressed byte buffer. */
    public static DocumentMetadata fromBytesUsingThrift(ByteBuffer data) throws IOException
    {
        DocumentMetadata docMeta = new DocumentMetadata();

        byte[] decompressedData = CassandraUtils.decompress(ByteBufferUtil.getArray(data));

        TTransport trans = new TMemoryInputTransport(decompressedData);
        TProtocol deser = protocolFactory.getProtocol(trans);

        try
        {
            docMeta.read(deser);
        }
        catch (TException e)
        {
            throw new IOException(e);
        }
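
The excerpt ends before the method returns docMeta. For a round trip, this deserializer is paired with a serializer in the opposite direction; a hedged sketch of what that looks like (TMemoryBuffer is the standard Thrift write transport; the CassandraUtils.compress call mirrors the decompress step above and is an assumption):

    public static ByteBuffer toBytesUsingThrift(DocumentMetadata docMeta) throws IOException
    {
        TMemoryBuffer trans = new TMemoryBuffer(1024);
        TProtocol ser = protocolFactory.getProtocol(trans);

        try
        {
            docMeta.write(ser);
        }
        catch (TException e)
        {
            throw new IOException(e);
        }

        // copy only the bytes actually written into the transport
        byte[] raw = new byte[trans.length()];
        System.arraycopy(trans.getArray(), 0, raw, 0, trans.length());

        return ByteBuffer.wrap(CassandraUtils.compress(raw));
    }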

Examples of org.apache.ctakes.preprocessor.DocumentMetaData

     */
    public void testProcess()
    {
        try
        {
            DocumentMetaData dmd = iv_cnotePreProcessor.process(iv_cnoteXML);

            // validate document properties
            String docID = "000000000";
            String serviceCode = "MNT";
            Map docProperties = dmd.getMetaData();
            String cnote_docID =
                (String) docProperties.get(
                    ClinicalNotePreProcessor.MD_KEY_DOC_ID);
            String cnote_serviceCode =
                (String) docProperties.get(
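
The test is truncated in the middle of the second property lookup (the elided map key is presumably the service-code counterpart of MD_KEY_DOC_ID). The assertions that follow plausibly compare the extracted values against the expected constants; a hedged completion:

            assertEquals(docID, cnote_docID);
            assertEquals(serviceCode, cnote_serviceCode);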

Examples of org.apache.ctakes.preprocessor.DocumentMetaData

  public void process(JCas jcas) throws AnalysisEngineProcessException {

      logger.info(" process(JCas)");

      String originalText = null;
      DocumentMetaData dmd;

      try {

          JCas originalView = jcas.getView("_InitialView");
          originalText = originalView.getSofaDataString();

          PreProcessor pp = new ClinicalNotePreProcessor(
                  dtdFile,
                  includeSectionMarkers.booleanValue());
          dmd = pp.process(originalText);

          String text = dmd.getText();
          StringBuffer sb = new StringBuffer(text);

          applyTextModifier(text, sb);

          // Create a view (and its Sofa) to hold the plain text version of
          // the CDA document
          JCas plaintextView = jcas.createView("plaintext");
          plaintextView.setDocumentText(sb.toString());

          // Add section (segment) annotations
          Iterator<String> segmentItr = (Iterator<String>) dmd.getSegmentIdentifiers().iterator();
          while (segmentItr.hasNext())
          {
              String segmentID = segmentItr.next();
              SegmentMetaData smd = dmd.getSegment(segmentID);

              Segment sa = new Segment(plaintextView);
              sa.setBegin(smd.span.start);
              sa.setEnd(smd.span.end);
              sa.setId(smd.id);

              sa.addToIndexes();
          }

          // Store meta data about the document
          Pairs propAnnot = new Pairs(plaintextView);
          Map metaDataMap = dmd.getMetaData();

          String docID = (String) metaDataMap.get(ClinicalNotePreProcessor.MD_KEY_DOC_ID);
          if (docID != null) {
              DocumentID newDocId = new DocumentID(plaintextView);
              newDocId.setDocumentID(docID);
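
The excerpt stops right after the DocumentID is populated. In UIMA, a feature structure only becomes visible to downstream annotators once it is registered with the CAS indexes, so the next step is plausibly (a hedged completion):

              newDocId.addToIndexes();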

Examples of org.apache.ctakes.preprocessor.DocumentMetaData

            }
           
            PreProcessor pp = new ClinicalNotePreProcessor(dtdFile, false);

            timestamp = System.currentTimeMillis();
            DocumentMetaData dmd = pp.process(hl7Text);
            elapsedTime = System.currentTimeMillis() - timestamp;
            System.out.println("PreProcessor Took " + elapsedTime + "ms");

            System.out.println("Plain Text Start");
            System.out.println(dmd.getText());
            System.out.println("Plain Text End");

            sectionNames = dmd.getSegmentIdentifiers();
            snItr = sectionNames.iterator();
            while (snItr.hasNext())
            {
                String sectionId = (String) snItr.next();
                SegmentMetaData smd = dmd.getSegment(sectionId);
                System.out.println("SECTION="
                        + sectionId
                        + "\tSTART_OFFSET="
                        + smd.span.start
                        + "\tEND_OFFSET="
                        + smd.span.end);
                //System.out.println(dmd.getText().substring(smd.span.start, smd.span.end));
            }

            Map metaDataMap = dmd.getMetaData();
            Iterator keyItr = metaDataMap.keySet().iterator();
            while (keyItr.hasNext())
            {
                Object key = keyItr.next();
                Object value = metaDataMap.get(key);
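
The metadata dump loop is truncated after fetching each value; a hedged completion that simply prints the pair in the same style as the section output above:

                System.out.println("KEY=" + key + "\tVALUE=" + value);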

Examples of org.exist.dom.DocumentMetadata

                    w.flush();
                    w.close();
                   
                    is = new FileInputStream(tempFile);
                   
                    final DocumentMetadata meta = doc.getMetadata();
                   
                    final Date created = new Date(meta.getCreated());
                    final Date lastModified = new Date(meta.getLastModified());
   
                    BinaryDocument binary = destination.validateBinaryResource(txn, broker, newName, is, mimeType.getName(), -1, created, lastModified);
                   
                    binary = destination.addBinaryResource(txn, broker, binary, is, mimeType.getName(), -1, created, lastModified);
                   
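
eXist's DocumentMetadata is a plain bean, so reading metadata back is symmetric with the setters shown in the next example (getMimeType is assumed from the setMimeType call below; the other two getters appear above):

    final DocumentMetadata meta = doc.getMetadata();

    String mimeType = meta.getMimeType();  // assumed getter, mirrors setMimeType
    long created = meta.getCreated();
    long lastModified = meta.getLastModified();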

Examples of org.exist.dom.DocumentMetadata

          // store as XML resource
          final IndexInfo info = currentCollection.validateXMLResource(txn, broker, docUri, is);

          resource = info.getDocument();
          final DocumentMetadata meta = resource.getMetadata();
          meta.setMimeType(mimetype);
          meta.setCreated(date_created.getTime());
          meta.setLastModified(date_modified.getTime());

          if ((publicid != null) || (systemid != null)) {
              final DocumentType docType = new DocumentTypeImpl(namedoctype, publicid, systemid);
              meta.setDocType(docType);
          }

          rh.startDocumentRestore(resource, atts);

          currentCollection.store(txn, broker, info, is, false);