Package org.jasen.core.engine

Source Code of org.jasen.core.engine.JasenTrainer

/*
* @(#)JasenTrainer.java  3/11/2004
*
* Copyright (c) 2004, 2005  jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
*   1. Redistributions of source code must retain the above copyright notice,
*      this list of conditions and the following disclaimer.
*
*   2. Redistributions in binary form must reproduce the above copyright
*      notice, this list of conditions and the following disclaimer in
*      the documentation and/or other materials provided with the distribution.
*
*   3. The names of the authors may not be used to endorse or promote products
*      derived from this software without specific prior written permission.
*
*   4. Any modification or additions to the software must be contributed back
*      to the project.
*
*   5. Any investigation or reverse engineering of source code or binary to
*      enable emails to bypass the filters, and hence inflict spam and or viruses
*      onto users who use or do not use jASEN could subject the perpetrator to
*      criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.engine;

import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import javax.mail.MessagingException;
import javax.mail.internet.MimeMessage;

import org.jasen.core.token.EmailTokenizer;
import org.jasen.error.EmptyErrorHandler;
import org.jasen.error.ErrorHandlerBroker;
import org.jasen.error.JasenException;
import org.jasen.interfaces.HTMLParser;
import org.jasen.interfaces.JasenMapStore;
import org.jasen.interfaces.JasenMessage;
import org.jasen.interfaces.MimeMessageParser;
import org.jasen.interfaces.MimeMessageTokenizer;
import org.jasen.interfaces.ParserData;

/**
* <P>
*   Trains the Jasen Engine and builds the JasenMap from a ham corpus and a spam corpus.
* </P>
* <p>
*   Training (and/or re-training) should be done regularly as new types of spam messages begin to appear
* </p>
* @author Jason Polites
*/
public class JasenTrainer
{
    private String spamCorpusPath;
    private String hamCorpusPath;
    private String storePath;

    private MimeMessageParser mimeParser;
    private MimeMessageTokenizer tokenizer;
    private Class htmlParserClass;

    private JasenMapStore store;

    private JasenMap map;

    private boolean load = false;

    private int errors = 0;

    /**
     *
     */
    public JasenTrainer() {
        super ();
    }

    /**
     * <p>
     * Simple file filter which just ensures the File objects listed are files and not folders
     * </p>
     */
    class TrainerFileFilter implements FileFilter {
        public boolean accept(File pathname) {
            return pathname.isFile();
        }
    }

    /**
     * Initialises the trainer
     * @throws JasenException
     */
    public void init() throws JasenException {

        InputStream in = null;

        try
        {
            in = getClass().getClassLoader().getResourceAsStream("JasenTrainer.properties");
            Properties props = new Properties();
            props.load(in);

            tokenizer = (MimeMessageTokenizer)Class.forName(props.getProperty("tokenizer")).newInstance();
            mimeParser = (MimeMessageParser)Class.forName(props.getProperty("mimeParser")).newInstance();
            htmlParserClass = Class.forName(props.getProperty("htmlParser"));
            store = (JasenMapStore)Class.forName(props.getProperty("store")).newInstance();

            int maxTokens = Integer.parseInt(props.getProperty("max-tokens"));
            int linguisticFailures = Integer.parseInt(props.getProperty("linguistic-failures"));

            tokenizer.setTokenLimit(maxTokens);

            // Set an empty error handler
            ErrorHandlerBroker.getInstance().setErrorHandler(new EmptyErrorHandler());

            if(tokenizer instanceof EmailTokenizer) {
                ((EmailTokenizer)tokenizer).setLinguisticLimit(linguisticFailures);
            }

            if(load) {
                map = store.load(storePath);
            }
            else
            {
                map = new JasenMap();
            }
        }
        catch (Exception e)
        {
            throw new JasenException(e);
        }
        finally
        {
            if(in != null) {
                try
                {
                    in.close();
                }
                catch (IOException ignore){}
            }
        }
    }

    /**
     * Trains the engine to produce the JasenMap
     * @throws JasenException
     * @see JasenMap
     */
    public void train() throws JasenException {
        errors = 0;

        try
        {
            System.out.println ("Jasen engine training commenced at " + new Date());
            System.out.println ("---------------------------------------------------------------");
            System.out.println ("Spam corpus: " + spamCorpusPath);
            System.out.println ("Ham corpus: " + hamCorpusPath);
            System.out.println ();

            // Train spam
            File spamFiles = new File(spamCorpusPath);
            File[] files = spamFiles.listFiles(new TrainerFileFilter());

            int observationsS = train(files, JasenMap.SPAM);

            map.setSpamObservations(observationsS);

            // Train ham
            File hamFiles = new File(hamCorpusPath);
            files = hamFiles.listFiles(new TrainerFileFilter());

            int observationsH = train(files, JasenMap.HAM);

            map.setHamObservations(observationsH);

            System.out.println ("Saving map...");
            store.save(map, storePath);
            System.out.println ("Training complete with " + errors + " errors");
            System.out.println ("Total mails scanned: " + (observationsS + observationsH));
        }
        catch (Exception e)
        {
            throw new JasenException(e);
        }
    }

    private int train(File[] files, int type) throws InstantiationException, IllegalAccessException {

        int count = 0;
        JasenMessage message = null;
        MimeMessage mm = null;
        String[] tokens = null;
        ParserData data = null;

        int counter = 1;

        HTMLParser htmlParser = null;

        System.out.println ("Scanning " + files.length + " files");
        for (int i = 0; i < files.length; i++)
        {
            try
            {
                htmlParser = (HTMLParser)htmlParserClass.newInstance();

                mm = getMimeMessage(files[i]);
                message = mimeParser.parse(mm);
                data = htmlParser.parse(mm, message, tokenizer);

                if(learn(data, type)) {
                    count++;
                }

                if((i / files.length) >= ((files.length/10)*counter)) {
                    System.out.print ((counter * 10) + "% ");
                    counter++;
                }
            }
            catch (Exception e)
            {
                errors++;
                ErrorHandlerBroker.getInstance().getErrorHandler().handleException(e);
            }
        }

        System.out.print ("100%");

        return count;
    }

    private boolean learn(ParserData data, int type) {
        String[] tokens = data.getMessageTokens();

        // We need to keep a log of the tokens we add so we don't add them twice
        // This is an arguable point, however technically the probability
        // calculations are only valid if we record the number of emails
        // containing the word, not the number of words found in total

        if(tokens != null) {

            List log = new LinkedList();
            String token = null;

            for (int i = 0; i < tokens.length; i++)
            {
                token = tokens[i].trim();

                if(!log.contains(token)) {
                    map.addToken(token, type);
                    log.add(token);
                }
            }

            return true;
        }
        else
        {
            return false;
        }

    }

    private MimeMessage getMimeMessage(File file) throws IOException, MessagingException {
        FileInputStream fin = null;
        MimeMessage mm = null;

        try
        {
            fin = new FileInputStream(file);
            mm = new MimeMessage(null, fin);
        }
        finally
        {
            if(fin != null) {
                try
                {
                    fin.close();
                }
                catch (IOException ignore){}
            }
        }

        return mm;
    }

    /**
     * Gets the local path to the folder containing the HAM corpus.
     * @return Either an absolute or classpath-relative path to the folder as a String.
     */
    public String getHamCorpusPath() {
        return hamCorpusPath;
    }
   
    /**
     * Sets the local path to the folder containind the HAM corpus.
     * @param hamCorpusPath Either an absolute or classpath-relative path to the folder as a String.
     */
    public void setHamCorpusPath(String hamCorpusPath) {
        this.hamCorpusPath = hamCorpusPath;
    }

    /**
     * Gets the JasenMap object produced as a result of a training run
     * @return Returns the map.
     * @see JasenMap
     */
    public JasenMap getMap() {
        return map;
    }

    /**
     * Sets the map object to be used in training.
     * @param map The map to set.
     */
    protected void setMap(JasenMap map) {
        this.map = map;
    }
   
    /**
     * Gets the MIME parser to be used during training.
     * @return Returns the mimeParser.
     */
    public MimeMessageParser getMimeParser() {
        return mimeParser;
    }
   
    /**
     * Sets the MIME parser to be used during training.
     * @param mimeParser The mimeParser to set.
     */
    public void setMimeParser(MimeMessageParser mimeParser) {
        this.mimeParser = mimeParser;
    }
   
   
    /**
     * Gets the local path to the folder containind the SPAM corpus.
     * @return Either an absolute or classpath-relative path to the folder as a String.
     */
    public String getSpamCorpusPath() {
        return spamCorpusPath;
    }
   
    /**
     * Sets the local path to the folder containind the SPAM corpus.
     * @param spamCorpusPath Either an absolute or classpath-relative path to the folder as a String.
     */
    public void setSpamCorpusPath(String spamCorpusPath) {
        this.spamCorpusPath = spamCorpusPath;
    }
   
   
    /**
     * Gets the store into which the map produced by the training run will be stored.
     * @return Returns the store.
     */
    public JasenMapStore getStore() {
        return store;
    }
   
   
    /**
     * Sets the store into which the map produced by the training run will be stored.
     * @param store The store to set.
     */
    public void setStore(JasenMapStore store) {
        this.store = store;
    }
   
    /**
     * Gets the tokenizer that will be used during training.
     * @return Returns the tokenizer.
     */
    public MimeMessageTokenizer getTokenizer() {
        return tokenizer;
    }
   
    /**
     * Sets the tokenizer that will be used during training.
     * @param tokenizer The tokenizer to set.
     */
    public void setTokenizer(MimeMessageTokenizer tokenizer) {
        this.tokenizer = tokenizer;
    }

    /**
     * Returns the value of the load option
     * @return True if the trainer was instructed to load a previously created map.  False otherwise
     */
    public boolean isLoad() {
        return load;
    }
   
    /**
     * Sets the load value for the trainer. 
     * @param load If true, the trainer will append data to an existing map.  Otherwise a new map will be created
     */
    public void setLoad(boolean load) {
        this.load = load;
    }
   
    /**
     * Gets the path to the local file system into which the final JasenMapStore will be saved.
     * <P>
     * This is only relevant to the DiskMapStore class, however the trainer assumes this anyway.
     * </P>
     * @return Either an absolute or classpath-relative path to the file as a String.
     */
    public String getStorePath() {
        return storePath;
    }
   
   
    /**
     * Sets the path to the local file system into which the final JasenMapStore will be saved.
     * <P>
     * This is only relevant to the DiskMapStore class, however the trainer assumes this anyway.
     * </P>
     * @param storePath Either an absolute or classpath-relative path to the file as a String.
     */
    public void setStorePath(String storePath) {
        this.storePath = storePath;
    }

}
TOP

Related Classes of org.jasen.core.engine.JasenTrainer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.