Source Code of com.code972.hebmorph.hspell.Loader

/***************************************************************************
 *   Copyright (C) 2010-2013 by                                            *
 *      Itamar Syn-Hershko <itamar at code972 dot com>                     *
 *    Ofer Fort <oferiko at gmail dot com> (initial Java port)           *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU Affero General Public License           *
 *   version 3, as published by the Free Software Foundation.              *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU Affero General Public License for more details.                   *
 *                                                                         *
 *   You should have received a copy of the GNU Affero General Public      *
 *   License along with this program; if not, see                          *
 *   <http://www.gnu.org/licenses/>.                                       *
 **************************************************************************/
package com.code972.hebmorph.hspell;


import com.code972.hebmorph.MorphData;
import com.code972.hebmorph.datastructures.DictRadix;


import java.io.*;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;


public final class Loader {
    protected List<String> dmasks;
    protected final boolean loadMorphData;
    private int lookupLen;


    protected InputStream fdict, fprefixes;
    protected InputStream fdesc = null, fstem = null;


    public Loader(File hspellFolder, boolean loadMorphData) throws IOException {
        this(new FileInputStream(new File(hspellFolder, Constants.sizesFile)), new FileInputStream(new File(hspellFolder, Constants.dmaskFile)),
                new FileInputStream(new File(hspellFolder, Constants.dictionaryFile)), new FileInputStream(new File(hspellFolder, Constants.prefixesFile)),
                new FileInputStream(new File(hspellFolder, Constants.descFile)), new FileInputStream(new File(hspellFolder, Constants.stemsFile)), loadMorphData);


        if (!hspellFolder.exists() || !hspellFolder.isDirectory())
            throw new IllegalArgumentException("Invalid hspell data folder provided");
    }


    /**
     *
     * @param classloader
     * @param hspellFolder      resources folder in which the hspell data is in; must end with /
     * @param loadMorphData
     * @throws IOException
     */
    public Loader(final ClassLoader classloader, final String hspellFolder, final boolean loadMorphData) throws IOException {
        this(classloader.getResourceAsStream(hspellFolder + Constants.sizesFile), classloader.getResourceAsStream(hspellFolder + Constants.dmaskFile),
                classloader.getResourceAsStream(hspellFolder + Constants.dictionaryFile), classloader.getResourceAsStream(hspellFolder + Constants.prefixesFile),
                classloader.getResourceAsStream(hspellFolder + Constants.descFile), classloader.getResourceAsStream(hspellFolder + Constants.stemsFile), loadMorphData);
    }


    public Loader(InputStream sizesFile, InputStream dmasksFile, InputStream dictFile, InputStream prefixesFile, InputStream descFile, InputStream stemsFile, boolean loadMorphData) throws IOException {
        fdict = new GZIPInputStream(dictFile);
        fprefixes = new GZIPInputStream(prefixesFile);
        this.loadMorphData = loadMorphData;


        if (loadMorphData) {
            dmasks = new ArrayList<>();
            boolean foundStartLine = false;
            String line;
            BufferedReader reader = new BufferedReader(new InputStreamReader(dmasksFile));
            while ((line = reader.readLine()) != null) {
                if (!foundStartLine) {
                    if (line.contains("dmasks[]")) {
                        foundStartLine = true;
                    }
                    continue;
                }
                dmasks.add(line);
            }
            reader.close();


            lookupLen = getWordCountInHSpellFolder(sizesFile);
            fdesc = new GZIPInputStream(descFile);
            fstem = new GZIPInputStream(stemsFile);
        }
    }
  
  public DictRadix<MorphData> loadDictionaryFromHSpellData() throws IOException {


    if (loadMorphData) {
            // Load the count of morphological data slots required
      final String lookup[] = new String[lookupLen + 1];


            try {
                final char[] sbuf = new char[Constants.MaxWordLength];
                int c = 0, n, slen = 0, i = 0;
                while ((c = fdict.read()) > -1) {
                    if ((c >= '0') && (c <= '9')) { // No conversion required for chars < 0xBE
                        // new word - finalize and save old word
                        lookup[i++] = new String(sbuf, 0, slen);


                        // and read how much to go back
                        n = 0;
                        do {
                            // base 10...
                            n *= 10;
                            n += (c - '0');
                        } while (((c = fdict.read()) > -1) && (c >= '0') && (c <= '9'));
                        slen -= n;
                    }
                    sbuf[slen++] = ISO8859_To_Unicode(c);
                }
            } finally {
                if (fdict != null) try { fdict.close(); } catch (IOException ignored) {}
            }


            final DictRadix<MorphData> ret = new DictRadix<MorphData>();
            try {
                for (int i = 0; lookup[i] != null; i++) {
                    MorphData data = new MorphData();
                    data.setPrefixes((short) fprefixes.read()); // Read prefix hint byte
                    data.setDescFlags(readDescFile(fdesc));


                    final List<Integer> stemReferences = readStemFile(fstem);
                    final String[] lemmas = new String[stemReferences.size()];
                    int stemPosition = 0;
                    for (int r : stemReferences) {
                        // This is a bypass for the psuedo-stem "שונות", as defined by hspell
                        // TODO: Try looking into changing this in hspell itself
                        if (lookup[r].equals("שונות") && !lookup[r].equals(lookup[i])) {
                            lemmas[stemPosition++] = null;
                        } else {
                            lemmas[stemPosition++] = lookup[r];
                        }
                    }
                    data.setLemmas(lemmas);
                    ret.addNode(lookup[i], data);
                }
            } finally {
                if (fprefixes != null) try { fprefixes.close(); } catch (IOException ignored) {}
                if (fdesc != null) try { fdesc.close(); } catch (IOException ignored) {}
                if (fstem != null) try { fstem.close(); } catch (IOException ignored) {}
            }


      return ret;


    } else { // Use optimized version for loading HSpell's dictionary files
      DictRadix<MorphData> ret = new DictRadix<MorphData>();


            InputStream fprefixes = null, fdict = null;
            try {
                final char[] sbuf = new char[Constants.MaxWordLength];
                int c = 0, n, slen = 0;
                while ((c = fdict.read()) > -1) {
                    if ((c >= '0') && (c <= '9')) { // No conversion required for chars < 0xBE
                        // new word - finalize old word first (set value)
                        sbuf[slen] = '\0';


                        // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                        // the prefixes mask in the node itself
                        MorphData data = new MorphData();
                        data.setPrefixes((short) fprefixes.read()); // Read prefix hint byte
                        ret.addNode(sbuf, data);


                        // and read how much to go back
                        n = 0;
                        do {
                            // base 10...
                            n *= 10;
                            n += (c - '0');
                        } while (((c = fdict.read()) > -1) && (c >= '0') && (c <= '9'));
                        slen -= n;
                    }
                    sbuf[slen++] = ISO8859_To_Unicode(c);
                }


            } finally {
                if (fprefixes != null) try { fprefixes.close(); } catch (IOException ignored) {}
                if (fdict != null) try { fdict.close(); } catch (IOException ignored) {}
            }


      return ret;
    }
  }


    public static int getWordCountInHSpellFolder(File path) throws IOException {
        return getWordCountInHSpellFolder(new FileInputStream(new File(path, Constants.sizesFile)));
    }


    public static int getWordCountInHSpellFolder(InputStream inputStream) throws IOException {
        final BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, Charset.defaultCharset()));
        reader.readLine();
        final String sizes = reader.readLine();
        reader.close();


        int tmp;
        tmp = sizes.indexOf(' ', sizes.indexOf('\n'));
        tmp = Integer.parseInt(sizes.substring(tmp + 1).trim());
        return tmp - 1; // hspell stores the actual word count + 1
    }


    private int bufPos = 0;
    private final int[] buf = new int[5];


    private final java.util.ArrayList<Integer> wordMasks = new java.util.ArrayList<Integer>();
    final Integer[] readDescFile(InputStream fdesc) throws IOException {
        while ((buf[bufPos] = fdesc.read()) > -1) {
            // Break on EOL or EOF
            if ((buf[bufPos] == '\n') || (buf[bufPos] == 0))
            {
                bufPos = 0;
                Integer[] ret = wordMasks.toArray(new Integer[wordMasks.size()]);
                wordMasks.clear();
                return ret;
            }
            bufPos++;
            if (bufPos % 2 == 0) {
                int i = buf[0] - 'A' + (buf[1] - 'A') * 26;
                wordMasks.add(Integer.valueOf(dmasks.get(i).substring(0, dmasks.get(i).length() - 1)));
                bufPos = 0;
                continue;
            }
        }
        return null;
    }


    // Note: What HSpell call "stems", which we define as lemmas
    private final java.util.ArrayList<Integer> wordStems = new java.util.ArrayList<Integer>();
    final List<Integer> readStemFile(InputStream fstem) throws IOException {
        wordStems.clear();
        while ((buf[bufPos] = fstem.read()) > -1) {
            // Break on EOL or EOF
            if ((buf[bufPos] == '\n') || (buf[bufPos] == 0)) {
                bufPos = 0;
                return wordStems;
            }


            bufPos++;
            if (bufPos % 3 == 0) {
                wordStems.add(buf[0] - 33 + (buf[1] - 33) * 94 + (buf[2] - 33) * 94 * 94);
                bufPos = 0;
                continue;
            }
        }
        return null;
    }


  // Mapping is based on
  // http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-8.TXT
  // 0xDF, 0xFD, 0xFE aren't converted
  private static char ISO8859_To_Unicode(int c) {
    if ((c >= 0xE0) && (c <= 0xFA))
    {
      return (char)(c + 0x4F0);
    }
    else if (c <= 0xBE)
    {
      return (char)c;
    }
    return ' ';
  }


    private final static Integer[] descFlags_noun;
    private final static Integer[] descFlags_person_name;
    private final static Integer[] descFlags_place_name;
    private final static Integer[] descFlags_empty;
    static {
        descFlags_noun = new Integer[] { 69 };
        descFlags_person_name = new Integer[] { 262145 };
        descFlags_place_name = new Integer[] { 262153 };
        descFlags_empty = new Integer[] { 0 };
    }
    public static DictRadix<MorphData> loadCustomWords(final InputStream customWordsStream, final DictRadix<MorphData> dictRadix) throws IOException {
        if (customWordsStream == null)
            return null;


        final BufferedReader input = new BufferedReader(new InputStreamReader(customWordsStream, Charset.forName("UTF-8")));
        final Hashtable<String, String> secondPass = new Hashtable<>();
        final DictRadix<MorphData> custom = new DictRadix<>();
        String line;
        while ((line = input.readLine()) != null) {
            String[] cells = line.split(" ");
            if (cells.length < 2)
                continue;


            MorphData md = null;
            switch (cells[1]) {
                case "שםעצם":
                    md = new MorphData();
                    md.setPrefixes((short) 63);
                    md.setLemmas(new String[]{cells[0]});
                    md.setDescFlags(descFlags_noun);
                    break;
                case "שםחברה":
                case "שםפרטי":
                    md = new MorphData();
                    md.setPrefixes((short) 8);
                    md.setLemmas(new String[]{cells[0]});
                    md.setDescFlags(descFlags_person_name);
                    break;
                case "שםמקום":
                    md = new MorphData();
                    md.setPrefixes((short) 8);
                    md.setLemmas(new String[]{cells[0]});
                    md.setDescFlags(descFlags_place_name);
                    break;
                case "שםמדויק":
                    md = new MorphData();
                    md.setPrefixes((short) 0);
                    md.setLemmas(new String[]{cells[0]});
                    md.setDescFlags(descFlags_empty);
                    break;
            }


            if (md == null) { // allow to associate new entries with other custom entries
                try {
                    md = custom.lookup(cells[1], false);
                } catch (IllegalArgumentException ignored_ex) {
                }
            }


            if (md == null) {
                try {
                    md = dictRadix.lookup(cells[1], false);
                } catch (IllegalArgumentException ignored_ex) {
                }
            }


            if (md != null) {
                custom.addNode(cells[0], md);
            } else {
                secondPass.put(cells[0], cells[1]);
            }
        }


        for (final Map.Entry<String, String> entry : secondPass.entrySet()) {
            try {
                custom.lookup(entry.getKey(), false);
                continue; // we already stored this word somehow
            } catch (IllegalArgumentException expected_ex) {
            }


            try {
                final MorphData md = custom.lookup(entry.getValue(), false);
                if (md != null) custom.addNode(entry.getKey(), md);
            } catch (IllegalArgumentException ignored_ex) {
            }
        }


        return custom;
    }
}
Source Code of com.code972.hebmorph.hspell.Loader

Related Classes of com.code972.hebmorph.hspell.Loader