Examples of MorphData

com.code972.hebmorph.MorphData

Examples of com.code972.hebmorph.MorphData

    }


    public static WordType isRecognizedWord(final String word, final boolean tolerate) {
        byte prefLen = 0;
        Integer prefixMask;
        MorphData md;


        if (customWords != null) {
            try {
                if (customWords.lookup(word) != null) return WordType.CUSTOM;
            } catch (IllegalArgumentException e) {
            }


            while (true) {
                // Make sure there are at least 2 letters left after the prefix (the words של, שלא for example)
                if (word.length() - prefLen < 2)
                    break;


                try {
                    prefixMask = prefixesTree.lookup(word.substring(0, ++prefLen));
                } catch (IllegalArgumentException e) {
                    break;
                }


                try {
                    md = customWords.lookup(word.substring(prefLen));
                } catch (IllegalArgumentException e) {
                    md = null;
                }
                if ((md != null) && ((md.getPrefixes() & prefixMask) > 0)) {
                    for (int result = 0; result < md.getLemmas().length; result++) {
                        if ((LingInfo.DMask2ps(md.getDescFlags()[result]) & prefixMask) > 0) {
                            return WordType.CUSTOM_WITH_PREFIX;
                        }
                    }
                }
            }
        }


        if (!isHebrewWord(word))
            return WordType.NON_HEBREW;


        try {
            if (dictRadix.lookup(word) != null) return WordType.HEBREW;
        } catch (IllegalArgumentException e) {
        }


        if (word.endsWith("'")) { // Try ommitting closing Geresh
            try {
                if (dictRadix.lookup(word.substring(0, word.length() - 1)) != null) return WordType.HEBREW;
            } catch (IllegalArgumentException e) {
            }
        }


        prefLen = 0;
        while (true) {
            // Make sure there are at least 2 letters left after the prefix (the words של, שלא for example)
            if (word.length() - prefLen < 2)
                break;


            try {
                prefixMask = prefixesTree.lookup(word.substring(0, ++prefLen));
            } catch (IllegalArgumentException e) {
                break;
            }


            try {
                md = dictRadix.lookup(word.substring(prefLen));
            } catch (IllegalArgumentException e) {
                md = null;
            }
            if ((md != null) && ((md.getPrefixes() & prefixMask) > 0)) {
                for (int result = 0; result < md.getLemmas().length; result++) {
                    if ((LingInfo.DMask2ps(md.getDescFlags()[result]) & prefixMask) > 0) {
                        return WordType.HEBREW_WITH_PREFIX;
                    }
                }
            }
        }

View Full Code Here

Examples of com.code972.hebmorph.MorphData

    }


    public static WordType isRecognizedWord(final String word, final boolean tolerate) {
        byte prefLen = 0;
        Integer prefixMask;
        MorphData md;


        if (customWords != null) {
            try {
                if (customWords.lookup(word) != null) return WordType.CUSTOM;
            } catch (IllegalArgumentException ignored_ex) {
            }


            while (true) {
                // Make sure there are at least 2 letters left after the prefix (the words של, שלא for example)
                if (word.length() - prefLen < 2)
                    break;


                try {
                    prefixMask = prefixesTree.lookup(word.substring(0, ++prefLen));
                } catch (IllegalArgumentException ignored_ex) {
                    break;
                }


                try {
                    md = customWords.lookup(word.substring(prefLen));
                } catch (IllegalArgumentException ignored_ex) {
                    md = null;
                }
                if ((md != null) && ((md.getPrefixes() & prefixMask) > 0)) {
                    for (int result = 0; result < md.getLemmas().length; result++) {
                        if ((LingInfo.DMask2ps(md.getDescFlags()[result]) & prefixMask) > 0) {
                            return WordType.CUSTOM_WITH_PREFIX;
                        }
                    }
                }
            }
        }


        if (!isHebrewWord(word))
            return WordType.NON_HEBREW;


        try {
            if (dictRadix.lookup(word) != null) return WordType.HEBREW;
        } catch (IllegalArgumentException ignored_ex) {
        }


        if (word.endsWith("'")) { // Try ommitting closing Geresh
            try {
                if (dictRadix.lookup(word.substring(0, word.length() - 1)) != null) return WordType.HEBREW;
            } catch (IllegalArgumentException ignored_ex) {
            }
        }


        prefLen = 0;
        while (true) {
            // Make sure there are at least 2 letters left after the prefix (the words של, שלא for example)
            if (word.length() - prefLen < 2)
                break;


            try {
                prefixMask = prefixesTree.lookup(word.substring(0, ++prefLen));
            } catch (IllegalArgumentException e) {
                break;
            }


            try {
                md = dictRadix.lookup(word.substring(prefLen));
            } catch (IllegalArgumentException e) {
                md = null;
            }
            if ((md != null) && ((md.getPrefixes() & prefixMask) > 0)) {
                for (int result = 0; result < md.getLemmas().length; result++) {
                    if ((LingInfo.DMask2ps(md.getDescFlags()[result]) & prefixMask) > 0) {
                        return WordType.HEBREW_WITH_PREFIX;
                    }
                }
            }
        }

View Full Code Here

Examples of com.code972.hebmorph.MorphData

            }


            final DictRadix<MorphData> ret = new DictRadix<MorphData>();
            try {
                for (int i = 0; lookup[i] != null; i++) {
                    MorphData data = new MorphData();
                    data.setPrefixes((short) fprefixes.read()); // Read prefix hint byte
                    data.setDescFlags(readDescFile(fdesc));


                    final List<Integer> stemReferences = readStemFile(fstem);
                    final String[] lemmas = new String[stemReferences.size()];
                    int stemPosition = 0;
                    for (int r : stemReferences) {
                        // This is a bypass for the psuedo-stem "שונות", as defined by hspell
                        // TODO: Try looking into changing this in hspell itself
                        if (lookup[r].equals("שונות") && !lookup[r].equals(lookup[i])) {
                            lemmas[stemPosition++] = null;
                        } else {
                            lemmas[stemPosition++] = lookup[r];
                        }
                    }
                    data.setLemmas(lemmas);
                    ret.addNode(lookup[i], data);
                }
            } finally {
                if (fprefixes != null) try { fprefixes.close(); } catch (IOException ignored) {}
                if (fdesc != null) try { fdesc.close(); } catch (IOException ignored) {}
                if (fstem != null) try { fstem.close(); } catch (IOException ignored) {}
            }


      return ret;


    } else { // Use optimized version for loading HSpell's dictionary files
      DictRadix<MorphData> ret = new DictRadix<MorphData>();


            InputStream fprefixes = null, fdict = null;
            try {
                final char[] sbuf = new char[Constants.MaxWordLength];
                int c = 0, n, slen = 0;
                while ((c = fdict.read()) > -1) {
                    if ((c >= '0') && (c <= '9')) { // No conversion required for chars < 0xBE
                        // new word - finalize old word first (set value)
                        sbuf[slen] = '\0';


                        // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                        // the prefixes mask in the node itself
                        MorphData data = new MorphData();
                        data.setPrefixes((short) fprefixes.read()); // Read prefix hint byte
                        ret.addNode(sbuf, data);


                        // and read how much to go back
                        n = 0;
                        do {

View Full Code Here

Examples of com.code972.hebmorph.MorphData

        while ((line = input.readLine()) != null) {
            String[] cells = line.split(" ");
            if (cells.length < 2)
                continue;


            MorphData md = null;
            switch (cells[1]) {
                case "שםעצם":
                    md = new MorphData();
                    md.setPrefixes((short) 63);
                    md.setLemmas(new String[]{cells[0]});
                    md.setDescFlags(descFlags_noun);
                    break;
                case "שםחברה":
                case "שםפרטי":
                    md = new MorphData();
                    md.setPrefixes((short) 8);
                    md.setLemmas(new String[]{cells[0]});
                    md.setDescFlags(descFlags_person_name);
                    break;
                case "שםמקום":
                    md = new MorphData();
                    md.setPrefixes((short) 8);
                    md.setLemmas(new String[]{cells[0]});
                    md.setDescFlags(descFlags_place_name);
                    break;
                case "שםמדויק":
                    md = new MorphData();
                    md.setPrefixes((short) 0);
                    md.setLemmas(new String[]{cells[0]});
                    md.setDescFlags(descFlags_empty);
                    break;
            }


            if (md == null) { // allow to associate new entries with other custom entries
                try {
                    md = custom.lookup(cells[1], false);
                } catch (IllegalArgumentException ignored_ex) {
                }
            }


            if (md == null) {
                try {
                    md = dictRadix.lookup(cells[1], false);
                } catch (IllegalArgumentException ignored_ex) {
                }
            }


            if (md != null) {
                custom.addNode(cells[0], md);
            } else {
                secondPass.put(cells[0], cells[1]);
            }
        }


        for (final Map.Entry<String, String> entry : secondPass.entrySet()) {
            try {
                custom.lookup(entry.getKey(), false);
                continue; // we already stored this word somehow
            } catch (IllegalArgumentException expected_ex) {
            }


            try {
                final MorphData md = custom.lookup(entry.getValue(), false);
                if (md != null) custom.addNode(entry.getKey(), md);
            } catch (IllegalArgumentException ignored_ex) {
            }
        }

View Full Code Here

Examples of com.code972.hebmorph.MorphData

            while ((str = bufferedReader.readLine()) != null) {
                String[] split = str.split(DELIMETER); // 0=value,1=prefix,2=lemmas,3=descFlags
                if (split.length != 4) {
                    throw new IOException("Wrong format detected");
                }
                MorphData md = new MorphData();
                md.setPrefixes(Short.parseShort(split[1]));
                String[] lemmaStrings = split[2].split(",");
                String[] descStrings = split[3].split(",");
                if (lemmaStrings.length != descStrings.length) {
                    throw new IOException("Number of lemmas does not match number of descFlags");
                }
                MorphData.Lemma[] lemmas = new MorphData.Lemma[lemmaStrings.length];


                for (int i = 0; i < lemmas.length; i++) { //null and "null" are read the same
                    String lem = lemmaStrings[i].equals("null")?null:lemmaStrings[i];
                    lemmas[i] = new MorphData.Lemma(lem,Integer.parseInt(descStrings[i]));
                }
                md.setLemmas(lemmas);
                dict.addNode(split[0], md);
            }
        } finally {
            if (bufferedReader != null) try {
                bufferedReader.close();

View Full Code Here

Examples of com.code972.hebmorph.MorphData

            }


            final DictRadix<MorphData> ret = new DictRadix<MorphData>();
            try {
                for (int i = 0; lookup[i] != null; i++) {
                    MorphData data = new MorphData();
                    data.setPrefixes((short) fprefixes.read()); // Read prefix hint byte
                    Integer[] descFlags = readDescFile(fdesc);


                    final List<Integer> stemReferences = readStemFile(fstem);
                    final MorphData.Lemma[] lemmas = new MorphData.Lemma[stemReferences.size()];
                    int stemPosition = 0;
                    for (int r : stemReferences) {
                        String lemma;
                        // This is a bypass for the psuedo-stem "שונות", as defined by hspell
                        // TODO: Try looking into changing this in hspell itself
                        if (lookup[r].equals("שונות") && !lookup[r].equals(lookup[i])) {
                            lemma = null;
                        } else {
                            lemma = lookup[r];
                        }
                        lemmas[stemPosition] = new MorphData.Lemma(lemma, descFlags[stemPosition]);
                        stemPosition++;
                    }
                    data.setLemmas(lemmas);
                    ret.addNode(lookup[i], data);
                }
            } finally {
                if (fprefixes != null) try {
                    fprefixes.close();
                } catch (IOException ignored) {
                }
                if (fdesc != null) try {
                    fdesc.close();
                } catch (IOException ignored) {
                }
                if (fstem != null) try {
                    fstem.close();
                } catch (IOException ignored) {
                }
            }


            return ret;


        } else { // Use optimized version for loading HSpell's dictionary files
            DictRadix<MorphData> ret = new DictRadix<MorphData>();


            try {
                final char[] sbuf = new char[Constants.MaxWordLength];
                int c = 0, n, slen = 0;
                while ((c = fdict.read()) > -1) {
                    if ((c >= '0') && (c <= '9')) { // No conversion required for chars < 0xBE
                        // new word - finalize old word first (set value)
                        sbuf[slen] = '\0';


                        // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                        // the prefixes mask in the node itself
                        MorphData data = new MorphData();
                        data.setPrefixes((short) fprefixes.read()); // Read prefix hint byte
                        ret.addNode(sbuf, data);


                        // and read how much to go back
                        n = 0;
                        do {

View Full Code Here

Examples of com.code972.hebmorph.MorphData

        while ((line = input.readLine()) != null) {
            String[] cells = line.split(" ");
            if (cells.length < 2)
                continue;


            MorphData md = null;
            switch (cells[1]) {
                case "שםעצם":
                    md = new MorphData();
                    md.setPrefixes((short) 63);
                    md.setLemmas(new MorphData.Lemma[]{new MorphData.Lemma(cells[0], descFlags_noun)});
                    break;
                case "שםחברה":
                case "שםפרטי":
                    md = new MorphData();
                    md.setPrefixes((short) 8);
                    md.setLemmas(new MorphData.Lemma[]{new MorphData.Lemma(cells[0], descFlags_person_name)});
                    break;
                case "שםמקום":
                    md = new MorphData();
                    md.setPrefixes((short) 8);
                    md.setLemmas(new MorphData.Lemma[]{new MorphData.Lemma(cells[0], descFlags_place_name)});
                    break;
                case "שםמדויק":
                    md = new MorphData();
                    md.setPrefixes((short) 0);
                    md.setLemmas(new MorphData.Lemma[]{new MorphData.Lemma(cells[0], descFlags_empty)});
                    break;
            }


            if (md == null) { // allow to associate new entries with other custom entries
                try {
                    md = custom.lookup(cells[1], false);
                } catch (IllegalArgumentException ignored_ex) {
                }
            }


            if (md == null) {
                try {
                    md = dictRadix.lookup(cells[1], false);
                } catch (IllegalArgumentException ignored_ex) {
                }
            }


            if (md != null) {
                custom.addNode(cells[0], md);
            } else {
                secondPass.put(cells[0], cells[1]);
            }
        }


        for (final Map.Entry<String, String> entry : secondPass.entrySet()) {
            try {
                custom.lookup(entry.getKey(), false);
                continue; // we already stored this word somehow
            } catch (IllegalArgumentException expected_ex) {
            }


            try {
                final MorphData md = custom.lookup(entry.getValue(), false);
                if (md != null) custom.addNode(entry.getKey(), md);
            } catch (IllegalArgumentException ignored_ex) {
            }
        }

View Full Code Here

Examples of com.code972.hebmorph.MorphData

            }


            final DictRadix<MorphData> ret = new DictRadix<MorphData>();
            try {
                for (int i = 0; lookup[i] != null; i++) {
                    MorphData data = new MorphData();
                    data.setPrefixes((short) fprefixes.read()); // Read prefix hint byte
                    data.setDescFlags(readDescFile(fdesc));


                    final List<Integer> stemReferences = readStemFile(fstem);
                    final String[] lemmas = new String[stemReferences.size()];
                    int stemPosition = 0;
                    for (int r : stemReferences) {
                        // This is a bypass for the psuedo-stem "שונות", as defined by hspell
                        // TODO: Try looking into changing this in hspell itself
                        if (lookup[r].equals("שונות") && !lookup[r].equals(lookup[i])) {
                            lemmas[stemPosition++] = null;
                        } else {
                            lemmas[stemPosition++] = lookup[r];
                        }
                    }
                    data.setLemmas(lemmas);
                    ret.addNode(lookup[i], data);
                }
            } finally {
                if (fprefixes != null) try { fprefixes.close(); } catch (IOException ignored) {}
                if (fdesc != null) try { fdesc.close(); } catch (IOException ignored) {}
                if (fstem != null) try { fstem.close(); } catch (IOException ignored) {}
            }


      return ret;


    } else { // Use optimized version for loading HSpell's dictionary files
      DictRadix<MorphData> ret = new DictRadix<MorphData>();


            try {
                final char[] sbuf = new char[Constants.MaxWordLength];
                int c = 0, n, slen = 0;
                while ((c = fdict.read()) > -1) {
                    if ((c >= '0') && (c <= '9')) { // No conversion required for chars < 0xBE
                        // new word - finalize old word first (set value)
                        sbuf[slen] = '\0';


                        // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                        // the prefixes mask in the node itself
                        MorphData data = new MorphData();
                        data.setPrefixes((short) fprefixes.read()); // Read prefix hint byte
                        ret.addNode(sbuf, data);


                        // and read how much to go back
                        n = 0;
                        do {

View Full Code Here

Examples of com.code972.hebmorph.MorphData

        while ((line = input.readLine()) != null) {
            String[] cells = line.split(" ");
            if (cells.length < 2)
                continue;


            MorphData md = null;
            switch (cells[1]) {
                case "שםעצם":
                    md = new MorphData();
                    md.setPrefixes((short) 63);
                    md.setLemmas(new String[]{cells[0]});
                    md.setDescFlags(descFlags_noun);
                    break;
                case "שםחברה":
                case "שםפרטי":
                    md = new MorphData();
                    md.setPrefixes((short) 8);
                    md.setLemmas(new String[]{cells[0]});
                    md.setDescFlags(descFlags_person_name);
                    break;
                case "שםמקום":
                    md = new MorphData();
                    md.setPrefixes((short) 8);
                    md.setLemmas(new String[]{cells[0]});
                    md.setDescFlags(descFlags_place_name);
                    break;
                case "שםמדויק":
                    md = new MorphData();
                    md.setPrefixes((short) 0);
                    md.setLemmas(new String[]{cells[0]});
                    md.setDescFlags(descFlags_empty);
                    break;
            }


            if (md == null) { // allow to associate new entries with other custom entries
                try {
                    md = custom.lookup(cells[1], false);
                } catch (IllegalArgumentException ignored_ex) {
                }
            }


            if (md == null) {
                try {
                    md = dictRadix.lookup(cells[1], false);
                } catch (IllegalArgumentException ignored_ex) {
                }
            }


            if (md != null) {
                custom.addNode(cells[0], md);
            } else {
                secondPass.put(cells[0], cells[1]);
            }
        }


        for (final Map.Entry<String, String> entry : secondPass.entrySet()) {
            try {
                custom.lookup(entry.getKey(), false);
                continue; // we already stored this word somehow
            } catch (IllegalArgumentException expected_ex) {
            }


            try {
                final MorphData md = custom.lookup(entry.getValue(), false);
                if (md != null) custom.addNode(entry.getKey(), md);
            } catch (IllegalArgumentException ignored_ex) {
            }
        }

View Full Code Here

Examples of com.code972.hebmorph.MorphData

                fdesc = new GZIPInputStream(new FileInputStream(new File(hspellFolder, Constants.descFile)));
                fstem = new GZIPInputStream(new FileInputStream(new File(hspellFolder, Constants.stemsFile)));


                final Loader loader = new Loader(hspellFolder);
                for (int i = 0; lookup[i] != null; i++) {
                    MorphData data = new MorphData();
                    data.setPrefixes((short) fprefixes.read()); // Read prefix hint byte
                    data.setDescFlags(loader.readDescFile(fdesc));


                    final List<Integer> stemReferences = loader.readStemFile(fstem);
                    final String[] lemmas = new String[stemReferences.size()];
                    int stemPosition = 0;
                    for (int r : stemReferences) {
                        // This is a bypass for the psuedo-stem "שונות", as defined by hspell
                        // TODO: Try looking into changing this in hspell itself
                        if (lookup[r].equals("שונות") && !lookup[r].equals(lookup[i])) {
                            lemmas[stemPosition++] = null;
                        } else {
                            lemmas[stemPosition++] = lookup[r];
                        }
                    }
                    data.setLemmas(lemmas);
                    ret.addNode(lookup[i], data);
                }
            } finally {
                if (fprefixes != null) try { fprefixes.close(); } catch (IOException ignored) {}
                if (fdesc != null) try { fdesc.close(); } catch (IOException ignored) {}
                if (fstem != null) try { fstem.close(); } catch (IOException ignored) {}
            }


      return ret;


    } else { // Use optimized version for loading HSpell's dictionary files
      DictRadix<MorphData> ret = new DictRadix<MorphData>();


            InputStream fprefixes = null, fdict = null;
            try {
                fdict = new GZIPInputStream(new FileInputStream(new File(hspellFolder, Constants.dictionaryFile)));
                fprefixes = new GZIPInputStream(new FileInputStream(new File(hspellFolder, Constants.prefixesFile)));


                final char[] sbuf = new char[Constants.MaxWordLength];
                int c = 0, n, slen = 0;
                while ((c = fdict.read()) > -1) {
                    if ((c >= '0') && (c <= '9')) { // No conversion required for chars < 0xBE
                        // new word - finalize old word first (set value)
                        sbuf[slen] = '\0';


                        // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                        // the prefixes mask in the node itself
                        MorphData data = new MorphData();
                        data.setPrefixes((short) fprefixes.read()); // Read prefix hint byte
                        ret.addNode(sbuf, data);


                        // and read how much to go back
                        n = 0;
                        do {

View Full Code Here

0 1

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.