Package net.yacy.kelondro.data.word

Examples of net.yacy.kelondro.data.word.Word


                                urlModified.getTime(),
                                System.currentTimeMillis(),
                                UTF8.getBytes(language),
                                doctype,
                                outlinksSame, outlinksOther);
        Word wprop;
        byte[] wordhash;
        while (i.hasNext()) {
            wentry = i.next();
            word = wentry.getKey();
            wprop = wentry.getValue();
View Full Code Here


        }
        */

        // check if the token appears in the text
        if (words.containsKey(token)) {
          final Word word = words.get(token);
          // token appears in text and matches an existing bookmark tag
          if (tags.containsKey(token)) {
            count = word.occurrences() * tags.get(token).size() * 100;
          }
          // token appears in text and has more than 3 characters
          if (token.length()>3) {
            count = word.occurrences() * 100;
          }
          topwords.add(new YMarkTag(token, count));
        }
      }
      count = 0;
View Full Code Here

  /**
   * Creates a comparator that resolves the strings it compares through the given map.
   * The map reference is stored as-is; no copy is made.
   *
   * @param words map keyed by string whose values are the {@link Word} objects
   *              consulted when comparing
   */
  public YMarkWordCountComparator(final Map<String,Word> words) {
    this.words = words;
  }
 
  public int compare(final String k1, final String k2) {
    final Word w1 = this.words.get(k1);
    final Word w2 = this.words.get(k2);
   
        if(w1.occurrences() > w2.occurrences())
            return 1;
        else if(w1.occurrences() < w2.occurrences())
            return -1;
        else
            return 0;
  }
View Full Code Here

                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib);
            }
       
            // finally check all words for missing flag entry
            final Iterator<Map.Entry<String, Word>> k = words.entrySet().iterator();
            Word wprop;
            Map.Entry<String, Word> we;
            while (k.hasNext()) {
                we = k.next();
                wprop = we.getValue();
                if (wprop.flags == null) {
View Full Code Here

            final Bitfield flagstemplate,
            final boolean useForLanguageIdentification,
            final WordCache meaningLib) {
        if (text == null) return;
        String word;
        Word wprop;
        WordTokenizer wordenum;
        wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib);
        int pip = 0;
        while (wordenum.hasMoreElements()) {
            word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
            if (useForLanguageIdentification) languageIdentificator.add(word);
            if (word.length() < 2) continue;
            wprop = words.get(word);
            if (wprop == null) wprop = new Word(0, pip, phrase);
            if (wprop.flags == null) wprop.flags = flagstemplate.clone();
            wprop.flags.set(flagpos, true);
            words.put(word, wprop);
            pip++;
            this.RESULT_NUMB_WORDS++;
View Full Code Here

        assert is != null;
        final Set<String> currsentwords = new HashSet<String>();
        String word = "";
        String k;
        int wordlen;
        Word wsp, wsp1;
        int wordHandle;
        int wordHandleCount = 0;
        int sentenceHandleCount = 0;
        int allwordcounter = 0;
        int allsentencecounter = 0;
        int wordInSentenceCounter = 1;
        boolean comb_indexof = false, last_last = false, last_index = false;
        final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
       
        // read source
        final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
        while (wordenum.hasMoreElements()) {
            word = wordenum.nextElement().toLowerCase(Locale.ENGLISH);
            if (languageIdentificator != null) languageIdentificator.add(word);
            if (word.length() < wordminsize) continue;
           
            // distinguish punctuation and words
            wordlen = word.length();
            if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
                // store sentence
                currsentwords.clear();
                wordInSentenceCounter = 1;
            } else {
                // check index.of detection
                if (last_last && comb_indexof && word.equals("modified")) {
                    this.RESULT_FLAGS.set(flag_cat_indexof, true);
                    wordenum.pre(true); // parse lines as they come with CRLF
                }
                if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
                last_last = word.equals("last");
                last_index = word.equals("index");
               
                // store word
                allwordcounter++;
                currsentwords.add(word);
                wsp = words.get(word);
                if (wsp != null) {
                    // word already exists
                    wordHandle = wsp.posInText;
                    wsp.inc();
                } else {
                    // word does not yet exist, create new word entry
                    wordHandle = wordHandleCount++;
                    wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
                    wsp.flags = RESULT_FLAGS.clone();
                    words.put(word, wsp);
                }
                // we now have the unique handle of the word, put it into the sentence:
                wordInSentenceCounter++;
View Full Code Here

        score = 0;
        token = tokens.nextElement();
       
        // check if the token appears in the text
        if (words.containsKey(token.toString())) {         
          final Word word = words.get(token.toString());
          // token appears in text and matches an existing bookmark tag
          if (tags.containsKey(token.toString())) {
            score = word.occurrences() * tags.get(token.toString()).size() * 200;
          }
          // token appears in text and has more than 3 characters
          else if (token.length()>3) {
            score = word.occurrences() * 100;
          }
          // if token is already part of a phrase, reduce score
          if(pwords.toString().indexOf(token.toString())>1) {
            score = score / 3;
          }
View Full Code Here

                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
            }

            // finally check all words for missing flag entry
            final Iterator<Map.Entry<String, Word>> k = this.words.entrySet().iterator();
            Word wprop;
            Map.Entry<String, Word> we;
            while (k.hasNext()) {
                we = k.next();
                wprop = we.getValue();
                if (wprop.flags == null) {
View Full Code Here

            final Bitfield flagstemplate,
            final boolean useForLanguageIdentification,
            final WordCache meaningLib) {
        if (text == null) return;
        String word;
        Word wprop;
        WordTokenizer wordenum;
        wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib);
        int pip = 0;
        while (wordenum.hasMoreElements()) {
            word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
            if (useForLanguageIdentification) this.languageIdentificator.add(word);
            if (word.length() < 2) continue;
            wprop = this.words.get(word);
            if (wprop == null) wprop = new Word(0, pip, phrase);
            if (wprop.flags == null) wprop.flags = flagstemplate.clone();
            wprop.flags.set(flagpos, true);
            this.words.put(word, wprop);
            pip++;
            this.RESULT_NUMB_WORDS++;
View Full Code Here

        assert is != null;
        final Set<String> currsentwords = new HashSet<String>();
        String word = "";
        String k;
        int wordlen;
        Word wsp;
        final Word wsp1;
        int wordHandle;
        int wordHandleCount = 0;
        final int sentenceHandleCount = 0;
        int allwordcounter = 0;
        final int allsentencecounter = 0;
        int wordInSentenceCounter = 1;
        boolean comb_indexof = false, last_last = false, last_index = false;
        final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);

        // read source
        final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
        while (wordenum.hasMoreElements()) {
            word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
            if (this.languageIdentificator != null) this.languageIdentificator.add(word);
            if (word.length() < wordminsize) continue;

            // distinguish punctuation and words
            wordlen = word.length();
            if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
                // store sentence
                currsentwords.clear();
                wordInSentenceCounter = 1;
            } else {
                // check index.of detection
                if (last_last && comb_indexof && word.equals("modified")) {
                    this.RESULT_FLAGS.set(flag_cat_indexof, true);
                    wordenum.pre(true); // parse lines as they come with CRLF
                }
                if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
                last_last = word.equals("last");
                last_index = word.equals("index");

                // store word
                allwordcounter++;
                currsentwords.add(word);
                wsp = this.words.get(word);
                if (wsp != null) {
                    // word already exists
                    wordHandle = wsp.posInText;
                    wsp.inc();
                } else {
                    // word does not yet exist, create new word entry
                    wordHandle = wordHandleCount++;
                    wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
                    wsp.flags = this.RESULT_FLAGS.clone();
                    this.words.put(word, wsp);
                }
                // we now have the unique handle of the word, put it into the sentence:
                wordInSentenceCounter++;
View Full Code Here

TOP

Related Classes of net.yacy.kelondro.data.word.Word

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.