Examples of com.carrotsearch.hppc.ByteArrayList

com.carrotsearch.hppc.ByteArrayList
An array-backed list of bytes. A single array is used to store and manipulate all elements. Reallocations are governed by a {@link ArraySizingStrategy}and may be expensive if they move around really large chunks of memory.
See {@link ObjectArrayList} class for API similarities and differences against JavaCollections.


    /* */
    @BeforeClass
    public static void before()
    {
        list = new ByteArrayList();
        list.resize(CELLS);
    }

View Full Code Here


        // Create holders for new arrays
        final List<char []> normalizedWordImages = Lists.newArrayList();
        final IntArrayList normalizedWordTf = new IntArrayList();
        final List<int []> wordTfByDocumentList = Lists.newArrayList();
        final ByteArrayList fieldIndexList = new ByteArrayList();
        final ShortArrayList types = new ShortArrayList();


        final int [] wordIndexes = new int [tokenCount];
        Arrays.fill(wordIndexes, -1);


        // Initial values for counters
        int tf = 1;
        int maxTf = 1;
        int maxTfVariantIndex = tokenImagesOrder[0];
        int totalTf = 1;
        int variantStartIndex = 0;


        // A byte set for word fields tracking
        final BitSet fieldIndices = new BitSet(context.allFields.name.length);


        // A stack for pushing information about the term's documents.
        final IntStack wordDocuments = new IntStack();


        if (documentIndexesArray[tokenImagesOrder[0]] >= 0)
        {
            wordDocuments.push(documentIndexesArray[tokenImagesOrder[0]]);
        }


        // Go through the ordered token images
        for (int i = 0; i < tokenImagesOrder.length - 1; i++)
        {
            final char [] image = tokenImages[tokenImagesOrder[i]];
            final char [] nextImage = tokenImages[tokenImagesOrder[i + 1]];
            final int tokenType = tokenTypesArray[tokenImagesOrder[i]];
            final int documentIndex = documentIndexesArray[tokenImagesOrder[i + 1]];


            // Reached the end of non-null tokens?
            if (image == null)
            {
                break;
            }


            // Check if we want to index this token at all
            if (isNotIndexed(tokenType))
            {
                variantStartIndex = i + 1;
                maxTfVariantIndex = tokenImagesOrder[i + 1];


                resetForNewTokenImage(documentIndexesArray, tokenImagesOrder, 
                    fieldIndices, wordDocuments, i);
                continue;
            }


            fieldIndices.set(tokensFieldIndex[tokenImagesOrder[i]]);


            // Now check if image case is changing
            final boolean sameCase = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR
                .compare(image, nextImage) == 0;
            if (sameCase)
            {
                // Case has not changed, just increase counters
                tf++;
                totalTf++;
                wordDocuments.push(documentIndex);
                continue;
            }


            // Case (or even token image) has changed. Update most frequent case
            // variant
            if (maxTf < tf)
            {
                maxTf = tf;
                maxTfVariantIndex = tokenImagesOrder[i];
                tf = 1;
            }


            final boolean sameImage = CharArrayComparators.CASE_INSENSITIVE_CHAR_ARRAY_COMPARATOR
                .compare(image, nextImage) == 0;


            // Check if token image has changed
            if (sameImage)
            {
                totalTf++;
                wordDocuments.push(documentIndex);
            }
            else
            {
                // The image has changed completely.
                // Before we start processing the new image, we need to
                // see if we want to store the previous image, and if so
                // we need add some data about it to the arrays
                
                // wordDocuments.size() may contain duplicate entries from the same document, 
                // but this check is faster than deduping, so we do it first.  
                if (wordDocuments.size() >= dfThreshold)
                {
                    // Flatten the list of documents this term occurred in.
                    final int [] sparseEncoding = SparseArray.toSparseEncoding(wordDocuments);
                    final int df = (sparseEncoding.length >> 1); 
                    if (df >= dfThreshold)
                    {
                        wordTfByDocumentList.add(sparseEncoding);
    
                        // Add the word to the word list
                        normalizedWordImages.add(tokenImages[maxTfVariantIndex]);
                        types.add(tokenTypesArray[maxTfVariantIndex]);
                        normalizedWordTf.add(totalTf);
                        fieldIndexList.add((byte) fieldIndices.bits[0]);


                        // Add this word's index in AllWords to all its instances
                        // in the AllTokens multiarray
                        for (int j = variantStartIndex; j < i + 1; j++)
                        {
                            wordIndexes[tokenImagesOrder[j]] = normalizedWordImages.size() - 1;
                        }
                    }
                }


                // Reinitialize counters
                totalTf = 1;
                tf = 1;
                maxTf = 1;
                maxTfVariantIndex = tokenImagesOrder[i + 1];
                variantStartIndex = i + 1;


                // Re-initialize int set used for document frequency calculation
                resetForNewTokenImage(documentIndexesArray, tokenImagesOrder,
                    fieldIndices, wordDocuments, i);
            }
        }


        // Mapping from allTokens
        context.allTokens.wordIndex = wordIndexes;


        context.allWords.image = normalizedWordImages
            .toArray(new char [normalizedWordImages.size()] []);
        context.allWords.tf = normalizedWordTf.toArray();
        context.allWords.tfByDocument = 
            wordTfByDocumentList.toArray(new int [wordTfByDocumentList.size()] []);
        context.allWords.fieldIndices = fieldIndexList.toArray();
        context.allWords.type = types.toArray();
    }

View Full Code Here


        // Prepare arrays
        images = Lists.newArrayList();
        tokenTypes = new ShortArrayList();
        documentIndices = new IntArrayList();
        fieldIndices = new ByteArrayList();


        final Iterator<Document> docIterator = documents.iterator();
        int documentIndex = 0;
        final ITokenizer ts = context.language.getTokenizer();
        final MutableCharArray wrapper = new MutableCharArray(CharArrayUtils.EMPTY_ARRAY);

View Full Code Here

        // Lists to accommodate the results
        final ArrayList<char []> stemImages = new ArrayList<char []>(allWordsCount);
        final IntArrayList stemTf = new IntArrayList(allWordsCount);
        final IntArrayList stemMostFrequentWordIndexes = new IntArrayList(allWordsCount);
        final ArrayList<int []> stemTfByDocumentList = new ArrayList<int []>(allWordsCount);
        final ByteArrayList fieldIndexList = new ByteArrayList();


        // Counters
        int totalTf = wordTfArray[stemImagesOrder[0]];
        int mostFrequentWordFrequency = wordTfArray[stemImagesOrder[0]];
        int mostFrequentWordIndex = stemImagesOrder[0];
        int stemIndex = 0;


        // A list of document-term-frequency pairs, by document, for all words with identical stems.
        final ArrayList<int[]> stemTfsByDocument = Lists.newArrayList();
        
        stemTfsByDocument.add(wordTfByDocumentArray[stemImagesOrder[0]]);
        byte fieldIndices = 0;
        fieldIndices |= wordsFieldIndices[0];


        // For locating query words
        final MutableCharArray buffer = new MutableCharArray(
            wordStemImages[stemImagesOrder[0]]);
        boolean inQuery = queryStems.contains(buffer);


        // Go through all words in the order of stem images
        for (int i = 0; i < stemImagesOrder.length - 1; i++)
        {
            final int orderIndex = stemImagesOrder[i];
            final char [] stem = wordStemImages[orderIndex];
            final int nextInOrderIndex = stemImagesOrder[i + 1];
            final char [] nextStem = wordStemImages[nextInOrderIndex];


            stemIndexesArray[orderIndex] = stemIndex;
            if (inQuery)
            {
                wordsType[orderIndex] |= ITokenizer.TF_QUERY_WORD;
            }


            // Now check if token image is changing
            final boolean sameStem = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR
                .compare(stem, nextStem) == 0;


            if (sameStem)
            {
                totalTf += wordTfArray[nextInOrderIndex];
                stemTfsByDocument.add(wordTfByDocumentArray[nextInOrderIndex]);
                fieldIndices |= wordsFieldIndices[nextInOrderIndex];
                if (mostFrequentWordFrequency < wordTfArray[nextInOrderIndex])
                {
                    mostFrequentWordFrequency = wordTfArray[nextInOrderIndex];
                    mostFrequentWordIndex = nextInOrderIndex;
                }
            }
            else
            {
                stemImages.add(stem);
                stemTf.add(totalTf);
                stemMostFrequentWordIndexes.add(mostFrequentWordIndex);
                storeTfByDocument(stemTfByDocumentList, stemTfsByDocument);
                fieldIndexList.add(fieldIndices);


                stemIndex++;
                totalTf = wordTfArray[nextInOrderIndex];
                mostFrequentWordFrequency = wordTfArray[nextInOrderIndex];
                mostFrequentWordIndex = nextInOrderIndex;
                fieldIndices = 0;
                fieldIndices |= wordsFieldIndices[nextInOrderIndex];


                stemTfsByDocument.clear();
                stemTfsByDocument.add(wordTfByDocumentArray[nextInOrderIndex]);


                buffer.reset(wordStemImages[nextInOrderIndex]);
                inQuery = queryStems.contains(buffer);
            }
        }


        // Store tf for the last stem in the array
        stemImages.add(wordStemImages[stemImagesOrder[stemImagesOrder.length - 1]]);
        stemTf.add(totalTf);
        stemMostFrequentWordIndexes.add(mostFrequentWordIndex);
        stemIndexesArray[stemImagesOrder[stemImagesOrder.length - 1]] = stemIndex;
        storeTfByDocument(stemTfByDocumentList, stemTfsByDocument);
        fieldIndexList.add(fieldIndices);
        if (inQuery)
        {
            wordsType[stemImagesOrder[stemImagesOrder.length - 1]] |= ITokenizer.TF_QUERY_WORD;
        }


        // Convert lists to arrays and store them in allStems
        context.allStems.image = stemImages.toArray(new char [stemImages.size()] []);
        context.allStems.mostFrequentOriginalWordIndex = stemMostFrequentWordIndexes
            .toArray();
        context.allStems.tf = stemTf.toArray();
        context.allStems.tfByDocument = stemTfByDocumentList
            .toArray(new int [stemTfByDocumentList.size()] []);
        context.allStems.fieldIndices = fieldIndexList.toArray();


        // References in allWords
        context.allWords.stemIndex = stemIndexesArray;
    }

View Full Code Here

TOP

Related Classes of com.carrotsearch.hppc.ByteArrayList

com.carrotsearch.hppc.hash.ByteHashFunction

com.carrotsearch.hppc.jub.IterationSpeedBenchmark

org.carrot2.text.preprocessing.CaseNormalizer

org.carrot2.text.preprocessing.LanguageModelStemmer

org.carrot2.text.preprocessing.Tokenizer

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.