/****************************************************************
* Licensed to the Apache Software Foundation (ASF) under one *
* or more contributor license agreements. See the NOTICE file *
* distributed with this work for additional information *
* regarding copyright ownership. The ASF licenses this file *
* to you under the Apache License, Version 2.0 (the *
* "License"); you may not use this file except in compliance *
* with the License. You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, *
* software distributed under the License is distributed on an *
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
* KIND, either express or implied. See the License for the *
* specific language governing permissions and limitations *
* under the License. *
****************************************************************/
package org.apache.james.transport.mailets;
import java.io.BufferedReader;
import java.io.StringReader;
import java.io.ByteArrayOutputStream;
import java.sql.Connection;
import java.util.Enumeration;
import javax.mail.internet.MimeMessage;
import javax.mail.Header;
import javax.mail.MessagingException;
import org.apache.avalon.cornerstone.services.datasources.DataSourceSelector;
import org.apache.avalon.excalibur.datasource.DataSourceComponent;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.james.Constants;
import org.apache.mailet.GenericMailet;
import org.apache.mailet.Mail;
import org.apache.james.util.JDBCUtil;
import org.apache.james.util.JDBCBayesianAnalyzer;
/**
* <P>Feeds ham OR spam messages to train the {@link BayesianAnalysis} mailet.</P>
*
* <P>The new token frequencies will be stored in a JDBC database.</P>
*
* <P>Sample configuration:</P>
* <PRE><CODE>
* <processor name="root">
*
* <mailet match="RecipientIs=not.spam@thisdomain.com" class="BayesianAnalysisFeeder">
* <repositoryPath> db://maildb </repositoryPath>
* <feedType>ham</feedType>
* <!--
* Set this to the maximum message size (in bytes) that a message may have
* to be analyzed (default is 100000).
* -->
* <maxSize>100000</maxSize>
* </mailet>
*
* <mailet match="RecipientIs=spam@thisdomain.com" class="BayesianAnalysisFeeder">
* <repositoryPath> db://maildb </repositoryPath>
* <feedType>spam</feedType>
* <!--
* Set this to the maximum message size (in bytes) that a message may have
* to be analyzed (default is 100000).
* -->
* <maxSize>100000</maxSize>
* </mailet>
*
 * </processor>
* </CODE></PRE>
*
* <P>The previous example will allow the user to send messages to the server
* and use the recipient email address as the indicator for whether the message
* is ham or spam.</P>
*
* <P>Using the example above, send good messages (ham not spam) to the email
* address "not.spam@thisdomain.com" to pump good messages into the feeder,
* and send spam messages (spam not ham) to the email
* address "spam@thisdomain.com" to pump spam messages into the feeder.</P>
*
* <p>The bayesian database tables will be updated during the training reflecting
* the new data</p>
*
* <P>At the end the mail will be destroyed (ghosted).</P>
*
* <P><B>The correct approach is to send the original ham/spam message as an attachment
* to another message sent to the feeder; all the headers of the enveloping message
* will be removed and only the original message's tokens will be analyzed.</B></P>
*
* <p>After a training session, the frequency <i>Corpus</i> used by <CODE>BayesianAnalysis</CODE>
* must be rebuilt from the database, in order to take advantage of the new token frequencies.
* Every 10 minutes a special thread in the <CODE>BayesianAnalysis</CODE> mailet will check if any
* change was made to the database, and rebuild the corpus if necessary.</p>
*
* <p>Only one message at a time is scanned (the database update activity is <I>synchronized</I>)
* in order to avoid too much database locking,
* as thousands of rows may be updated just for one message fed.</p>
* @see BayesianAnalysis
* @see org.apache.james.util.BayesianAnalyzer
* @see org.apache.james.util.JDBCBayesianAnalyzer
* @version CVS $Revision: $ $Date: $
* @since 2.3.0
*/
public class BayesianAnalysisFeeder
    extends GenericMailet {

    /**
     * JDBC helper used for safe connection cleanup; logs through this mailet.
     */
    private final JDBCUtil theJDBCUtil = new JDBCUtil() {
        protected void delegatedLog(String logString) {
            log("BayesianAnalysisFeeder: " + logString);
        }
    };

    /**
     * The JDBCBayesianAnalyzer that does all the token-counting and
     * database-update work; logs through this mailet.
     */
    private JDBCBayesianAnalyzer analyzer = new JDBCBayesianAnalyzer() {
        protected void delegatedLog(String logString) {
            log("BayesianAnalysisFeeder: " + logString);
        }
    };

    /** Data source resolved from the repositoryPath init parameter. */
    private DataSourceComponent datasource;

    /** Repository path init parameter, e.g. "db://maildb". */
    private String repositoryPath;

    /** Feed type init parameter: "ham" (case-insensitive) or anything else for spam. */
    private String feedType;

    /**
     * Return a string describing this mailet.
     *
     * @return a string describing this mailet
     */
    public String getMailetInfo() {
        return "BayesianAnalysisFeeder Mailet";
    }

    /**
     * Maximum message size (in bytes) that will be analyzed;
     * larger messages are ignored. Defaults to 100000.
     */
    private int maxSize = 100000;

    /**
     * Getter for property maxSize.
     * @return Value of property maxSize.
     */
    public int getMaxSize() {
        return this.maxSize;
    }

    /**
     * Setter for property maxSize.
     * @param maxSize New value of property maxSize.
     */
    public void setMaxSize(int maxSize) {
        this.maxSize = maxSize;
    }

    /**
     * Mailet initialization routine: reads the mandatory repositoryPath
     * and feedType parameters, the optional maxSize parameter, and
     * initializes the database connection.
     *
     * @throws MessagingException if a mandatory parameter is missing,
     *         maxSize is not a valid integer, or the datasource cannot
     *         be initialized
     */
    public void init() throws MessagingException {
        repositoryPath = getInitParameter("repositoryPath");
        if (repositoryPath == null) {
            throw new MessagingException("repositoryPath is null");
        }
        feedType = getInitParameter("feedType");
        if (feedType == null) {
            throw new MessagingException("feedType is null");
        }
        String maxSizeParam = getInitParameter("maxSize");
        if (maxSizeParam != null) {
            try {
                setMaxSize(Integer.parseInt(maxSizeParam));
            } catch (NumberFormatException nfe) {
                // Surface a configuration error through the declared exception
                // type instead of letting the unchecked NumberFormatException
                // escape init().
                throw new MessagingException("maxSize is not a valid integer: " + maxSizeParam, nfe);
            }
        }
        log("maxSize: " + getMaxSize());
        initDb();
    }

    /**
     * Looks up the DataSourceComponent named by repositoryPath (the part
     * after "://") and initializes the analyzer's SQL queries.
     *
     * @throws MessagingException if the datasource cannot be obtained or
     *         the SQL queries cannot be initialized
     */
    private void initDb() throws MessagingException {
        try {
            ServiceManager serviceManager = (ServiceManager) getMailetContext().getAttribute(Constants.AVALON_COMPONENT_MANAGER);
            // Get the DataSourceSelector block
            DataSourceSelector datasources = (DataSourceSelector) serviceManager.lookup(DataSourceSelector.ROLE);
            // Strip the scheme prefix (e.g. "db://"); the remainder names the data-source.
            int stindex = repositoryPath.indexOf("://") + 3;
            String datasourceName = repositoryPath.substring(stindex);
            datasource = (DataSourceComponent) datasources.select(datasourceName);
        } catch (Exception e) {
            throw new MessagingException("Can't get datasource", e);
        }
        try {
            // NOTE(review): the connection obtained here is assumed to be
            // managed/closed by initSqlQueries — confirm in JDBCBayesianAnalyzer.
            analyzer.initSqlQueries(datasource.getConnection(), getMailetContext());
        } catch (Exception e) {
            throw new MessagingException("Exception initializing queries", e);
        }
    }

    /**
     * Scans the mail and updates the token frequencies in the database,
     * then ghosts the mail.
     *
     * The database work is synchronized on JDBCBayesianAnalyzer.DATABASE_LOCK
     * in order to avoid too much database locking, as thousands of rows may
     * be updated just for one message fed.
     *
     * @param mail The Mail message to be scanned.
     */
    public void service(Mail mail) {
        boolean dbUpdated = false;
        // The feeder always consumes the message, whether or not it trains.
        mail.setState(Mail.GHOST);
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        Connection conn = null;
        try {
            MimeMessage message = mail.getMessage();
            String messageId = message.getMessageID();
            if (message.getSize() > getMaxSize()) {
                log(messageId + " Feeding HAM/SPAM ignored because message size > " + getMaxSize() + ": " + message.getSize());
                return;
            }
            // Strip the enveloping message's headers so only the attached
            // original message's tokens are analyzed.
            clearAllHeaders(message);
            message.writeTo(baos);
            // NOTE(review): baos.toString() decodes with the platform default
            // charset; non-ASCII content may be mis-decoded — consider an
            // explicit charset.
            BufferedReader br = new BufferedReader(new StringReader(baos.toString()));
            // this is synchronized to avoid concurrent update of the corpus
            synchronized (JDBCBayesianAnalyzer.DATABASE_LOCK) {
                conn = datasource.getConnection();
                // Train inside a single transaction so a failure rolls back cleanly.
                if (conn.getAutoCommit()) {
                    conn.setAutoCommit(false);
                }
                dbUpdated = true;
                // Clear out any existing word/counts etc..
                analyzer.clear();
                if ("ham".equalsIgnoreCase(feedType)) {
                    log(messageId + " Feeding HAM");
                    // Process the stream as ham (not spam).
                    analyzer.addHam(br);
                    // Update storage statistics.
                    analyzer.updateHamTokens(conn);
                } else {
                    log(messageId + " Feeding SPAM");
                    // Process the stream as spam.
                    analyzer.addSpam(br);
                    // Update storage statistics.
                    analyzer.updateSpamTokens(conn);
                }
                // Commit our changes if necessary.
                if (conn != null && dbUpdated && !conn.getAutoCommit()) {
                    conn.commit();
                    dbUpdated = false;
                    log(messageId + " Training ended successfully");
                    // Let BayesianAnalysis know the corpus needs rebuilding.
                    JDBCBayesianAnalyzer.touchLastDatabaseUpdateTime();
                }
            }
        } catch (java.sql.SQLException se) {
            log("SQLException: " + se.getMessage(), se);
        } catch (java.io.IOException ioe) {
            log("IOException: " + ioe.getMessage(), ioe);
        } catch (javax.mail.MessagingException me) {
            log("MessagingException: " + me.getMessage(), me);
        } finally {
            // Rollback our changes if the commit did not happen.
            try {
                if (conn != null && dbUpdated && !conn.getAutoCommit()) {
                    conn.rollback();
                    dbUpdated = false;
                }
            } catch (Exception e) {
                // Best-effort rollback: log instead of silently swallowing.
                log("Exception while rolling back: " + e.getMessage());
            }
            theJDBCUtil.closeJDBCConnection(conn);
        }
    }

    /**
     * Removes every header from the message (best-effort, one header at a
     * time) and saves the changes, so that only the message body is fed to
     * the analyzer.
     *
     * @param message the message whose headers are removed in place
     * @throws javax.mail.MessagingException if the headers cannot be
     *         enumerated or the changes cannot be saved
     */
    private void clearAllHeaders(MimeMessage message) throws javax.mail.MessagingException {
        Enumeration headers = message.getAllHeaders();
        while (headers.hasMoreElements()) {
            Header header = (Header) headers.nextElement();
            try {
                message.removeHeader(header.getName());
            } catch (javax.mail.MessagingException me) {
                // Best-effort removal: keep stripping the remaining headers.
                log("Unable to remove header \"" + header.getName() + "\": " + me.getMessage());
            }
        }
        message.saveChanges();
    }
}