// Copyright 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.enterprise.connector.pusher;
import com.google.common.annotations.VisibleForTesting;
import com.google.enterprise.connector.logging.NDC;
import com.google.enterprise.connector.manager.Context;
import com.google.enterprise.connector.spi.Document;
import com.google.enterprise.connector.spi.RepositoryDocumentException;
import com.google.enterprise.connector.spi.RepositoryException;
import com.google.enterprise.connector.spi.SpiConstants;
import com.google.enterprise.connector.spi.SpiConstants.FeedType;
import com.google.enterprise.connector.traversal.FileSizeLimitInfo;
import com.google.enterprise.connector.util.filter.DocumentFilterFactory;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.LinkedList;
import java.util.ListIterator;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Generates an XML feed for each Document it is given and submits the
 * accumulated feed to the GSA (Google Search Appliance).
 */
public class DocPusher implements Pusher {
private static final Logger LOGGER =
Logger.getLogger(DocPusher.class.getName());
/**
* Separate Logger for Feed Logging.
*/
private static final Logger FEED_WRAPPER_LOGGER =
Logger.getLogger(LOGGER.getName() + ".FEED_WRAPPER");
private static final Logger FEED_LOGGER =
Logger.getLogger(FEED_WRAPPER_LOGGER.getName() + ".FEED");
private static final Level FEED_LOG_LEVEL = Level.FINER;
/**
* Configured maximum document size and maximum feed file size supported.
*/
private final FileSizeLimitInfo fileSizeLimit;
/**
* FeedConnection that is the sink for our generated XmlFeeds.
*/
private final FeedConnection feedConnection;
/**
* The {@link DocumentFilterFactory} is used to construct
* {@code Document} instances that act as filters on a source
* document. Document filters may add, remove, or modify
* {@code Properties}. The DocumentFilterFactory set here
* is typically a {@link DocumentFilterChain} - a chain of
* DocumentFilterFactory beans that is used to construct a
* Document manipulation pipeline.
*/
private final DocumentFilterFactory documentFilterFactory;
/**
* The Connector name that is the dataSource for this Feed.
*/
private final String connectorName;
/**
* ExcecutorService that submits a Feed to the GSA in a separate thread.
* This allows us to overlap I/O reading content from the Repository
* in the traversal thread, and submitting content to the GSA in
* a submitFeed thread.
*/
private final ExecutorService feedSender;
/**
* This is the list of outstanding asynchronous feed submissions.
*/
private final LinkedList<FutureTask<String>> submissions;
/**
* This is used to build up a multi-record feed. Documents are added to the
* feed until the size of the feed exceeds the FileSizeLimitInfo.maxFeedSize
* or we are finished with the batch of documents. The feed is then
* submitted to the feed connection.
*/
private XmlFeed xmlFeed = null;
/**
* This field is used to construct a feed record in parallel to the main feed
* InputStream construction. It is only used if the feed logging level is set
* to the appropriate level. It only exists during the time the main feed is
* being constructed. Once sufficient information has been appended to this
* buffer its contents will be logged and it will be nulled.
*/
private StringBuilder feedLog = null;
// For use by unit tests.
private String gsaResponse;
/**
* Creates a {@code DocPusher} object from the specified
* {@code feedConnection} and {@code connectorName}. The supplied
* {@link FileSizeLimitInfo} specifies constraints as to the size of a
* Document's content and the size of generated Feed files.
*
* @param feedConnection a FeedConnection
* @param connectorName The connector name that is the source of the feed
* @param fileSizeLimitInfo FileSizeLimitInfo constraints on document content
* and feed size.
* @param documentFilterFactory a {@link DocumentFilterFactory} that creates
* document processing filters.
*/
public DocPusher(FeedConnection feedConnection, String connectorName,
FileSizeLimitInfo fileSizeLimitInfo,
DocumentFilterFactory documentFilterFactory) {
this.feedConnection = feedConnection;
this.connectorName = connectorName;
this.fileSizeLimit = fileSizeLimitInfo;
this.documentFilterFactory = documentFilterFactory;
// Initialize background feed submission.
this.submissions = new LinkedList<FutureTask<String>>();
this.feedSender = Executors.newSingleThreadExecutor();
}
/**
* Return the Feed Logger.
*/
public static Logger getFeedLogger() {
return FEED_WRAPPER_LOGGER;
}
/**
* Gets the response from GSA when the feed is sent. For testing only.
*
* @return gsaResponse response from GSA.
*/
protected String getGsaResponse() {
return gsaResponse;
}
/**
* The {@code DocumentStore} parameter is ignored and may be null.
*
* @param document Document corresponding to the document.
* @param documentStore {@code DocumentStore} for recording document
* status. Ignored - may be {@code null}.
* @deprecated Use the overload without the {@code DocumentStore} parameter
*/
@SuppressWarnings("deprecation")
@Deprecated
public PusherStatus take(Document document,
com.google.enterprise.connector.database.DocumentStore documentStore)
throws PushException, FeedException, RepositoryException {
return take(document);
}
/**
* Takes a Document and sends a the feed to the GSA.
*
* @param document Document corresponding to the document.
* @return true if Pusher should accept more documents, false otherwise.
* @throws PushException if Pusher problem
* @throws FeedException if transient Feed problem
* @throws RepositoryDocumentException if fatal Document problem
* @throws RepositoryException if transient Repository problem
*/
@Override
public PusherStatus take(Document document)
throws PushException, FeedException, RepositoryException {
if (feedSender.isShutdown()) {
return PusherStatus.DISABLED;
}
checkSubmissions();
// Apply any configured Document filters to the document.
document = documentFilterFactory.newDocumentFilter(document);
FeedType feedType;
try {
feedType = DocUtils.getFeedType(document);
} catch (RuntimeException e) {
LOGGER.log(Level.WARNING,
"Rethrowing RuntimeException as RepositoryDocumentException", e);
throw new RepositoryDocumentException(e);
}
// All feeds in a feed file must be of the same type.
// If the feed would change type, send the feed off to the GSA
// and start a new one.
// TODO: Fix this check to allow ACLs in any type feed.
if (xmlFeed != null && !feedType.isCompatible(xmlFeed.getFeedType())) {
if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine("A new feedType, " + feedType + ", requires a new feed for "
+ connectorName + ". Closing feed and sending to GSA.");
}
submitFeed();
}
if (xmlFeed == null) {
if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine("Creating new " + feedType + " feed for " + connectorName);
}
try {
startNewFeed(feedType);
} catch (OutOfMemoryError me) {
throw new PushException("Unable to allocate feed buffer. Try reducing"
+ " the maxFeedSize setting, reducing the number of connector"
+ " intances, or adjusting the JVM heap size parameters.", me);
}
}
boolean isThrowing = false;
int resetPoint = xmlFeed.size();
int resetCount = xmlFeed.getRecordCount();
try {
if (LOGGER.isLoggable(Level.FINER)) {
LOGGER.log(Level.FINER, "DOCUMENT: Adding document with docid={0} and "
+ "searchurl={1} from connector {2} to feed.", new Object[] {
DocUtils.getOptionalString(document, SpiConstants.PROPNAME_DOCID),
DocUtils.getOptionalString(document,
SpiConstants.PROPNAME_SEARCHURL),
connectorName});
}
// Add this document to the feed.
xmlFeed.addRecord(document);
// If the feed is full, send it off to the GSA.
if (xmlFeed.isFull() || lowMemory()) {
if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine("Feed for " + connectorName + " has grown to "
+ xmlFeed.size() + " bytes. Closing feed and sending to GSA.");
}
submitFeed();
return getPusherStatus();
}
// Indicate that this Pusher may accept more documents.
return PusherStatus.OK;
} catch (OutOfMemoryError me) {
resetFeed(resetPoint, resetCount);
throw new PushException("Out of memory building feed, retrying.", me);
} catch (RuntimeException e) {
resetFeed(resetPoint, resetCount);
LOGGER.log(Level.WARNING,
"Rethrowing RuntimeException as RepositoryDocumentException", e);
throw new RepositoryDocumentException(e);
} catch (RepositoryDocumentException rde) {
// Skipping this document, remove it from the feed.
resetFeed(resetPoint, resetCount);
throw rde;
} catch (IOException ioe) {
LOGGER.log(Level.SEVERE, "IOException while reading: skipping", ioe);
resetFeed(resetPoint, resetCount);
Throwable t = ioe.getCause();
isThrowing = true;
if (t != null && (t instanceof RepositoryException)) {
throw (RepositoryException) t;
} else {
throw new RepositoryDocumentException("I/O error reading data", ioe);
}
}
}
/** Rolls back a feed to the reset point. */
private void resetFeed(int resetPoint, int resetCount) {
xmlFeed.reset(resetPoint);
xmlFeed.setRecordCount(resetCount);
}
/**
* Finish a feed. No more documents are anticipated.
* If there is an outstanding feed file, submit it to the GSA.
*
* @throws PushException if Pusher problem
* @throws FeedException if transient Feed problem
* @throws RepositoryException
*/
@Override
public void flush() throws PushException, FeedException, RepositoryException {
checkSubmissions();
if (!feedSender.isShutdown()) {
if (xmlFeed != null) {
LOGGER.fine("Flushing accumulated feed to GSA");
submitFeed();
}
feedSender.shutdown();
}
while (!feedSender.isTerminated()) {
try {
feedSender.awaitTermination(10, TimeUnit.SECONDS);
} catch (InterruptedException ie) {
if (checkSubmissions() > 0) {
throw new FeedException("Interrupted while waiting for feeds.");
}
}
}
checkSubmissions();
}
/**
* Cancels any feed being constructed. Any accumulated feed data is lost.
*/
@Override
public void cancel() {
// Discard any feed under construction.
if (xmlFeed != null) {
LOGGER.fine("Discarding accumulated feed for " + connectorName);
xmlFeed = null;
}
if (feedLog != null) {
feedLog = null;
}
// Cancel any feeds under asynchronous submission.
feedSender.shutdownNow();
}
@Override
public PusherStatus getPusherStatus()
throws PushException, FeedException, RepositoryException {
// Is Pusher shutdown?
if (feedSender.isShutdown()) {
return PusherStatus.DISABLED;
}
// If we are running low on memory, don't start another feed -
// tell the Traverser to finish this batch.
if (lowMemory()) {
return PusherStatus.LOW_MEMORY;
}
// If the number of feeds waiting to be sent has backed up,
// tell the Traverser to finish this batch.
if (checkSubmissions() > 10) {
return PusherStatus.LOCAL_FEED_BACKLOG;
} else if (feedConnection.isBacklogged()) {
return PusherStatus.GSA_FEED_BACKLOG;
}
// Indicate that this Pusher may accept more documents.
return PusherStatus.OK;
}
/**
* Checks on asynchronously submitted feeds to see if they completed
* or failed. If any of the submissions failed, throw an Exception.
*
* @return number if items remaining in the submissions list
*/
@VisibleForTesting
int checkSubmissions()
throws PushException, FeedException, RepositoryException {
int count = 0; // Count of outstanding items in the list.
synchronized(submissions) {
ListIterator<FutureTask<String>> iter = submissions.listIterator();
while (iter.hasNext()) {
FutureTask<String> future = iter.next();
if (future.isDone()) {
iter.remove();
try {
gsaResponse = future.get();
} catch (InterruptedException ie) {
// Shouldn't happen if isDone.
} catch (ExecutionException ee) {
Throwable cause = ee.getCause();
if (cause == null) {
cause = ee;
}
if (cause instanceof PushException) {
throw (PushException) cause;
} else if (cause instanceof FeedException) {
throw (FeedException) cause;
} else if (cause instanceof RepositoryException) {
throw (RepositoryException) cause;
} else {
throw new FeedException("Error submitting feed", cause);
}
}
} else {
count++;
}
}
}
return count;
}
/**
* Checks for low available memory condition.
*
* @return true if free memory is running low.
*/
private boolean lowMemory() {
long threshold = ((fileSizeLimit.maxFeedSize() + fileSizeLimit.maxDocumentSize()) * 4) / 3;
Runtime rt = Runtime.getRuntime();
if ((rt.maxMemory() - (rt.totalMemory() - rt.freeMemory())) < threshold) {
rt.gc();
if ((rt.maxMemory() - (rt.totalMemory() - rt.freeMemory())) < threshold) {
return true;
}
}
return false;
}
/**
* Allocates initial memory for a new XmlFeed and feed logger.
*
* @param feedType
*/
private void startNewFeed(FeedType feedType) throws PushException {
// Allocate a buffer to construct the feed log.
try {
if (FEED_LOGGER.isLoggable(FEED_LOG_LEVEL) && feedLog == null) {
feedLog = new StringBuilder(256 * 1024);
feedLog.append("Records generated for ").append(feedType);
feedLog.append(" feed of ").append(connectorName).append(":\n");
}
} catch (OutOfMemoryError me) {
throw new OutOfMemoryError(
"Unable to allocate feed log buffer for connector " + connectorName);
}
long feedSize = fileSizeLimit.maxFeedSize();
try {
try {
// Allocate XmlFeed of the target size.
xmlFeed = new XmlFeed(connectorName, feedType, fileSizeLimit, feedLog,
feedConnection);
} catch (OutOfMemoryError me) {
// We shouldn't even have gotten this far under a low memory condition.
// However, try to allocate a tiny feed buffer. It should fill up on
// the first document, forcing it to be submitted. DocPusher.take()
// should then return a signal to the caller to terminate the batch.
LOGGER.warning("Insufficient memory available to allocate an optimally"
+ " sized feed - retrying with a much smaller feed allocation.");
feedSize = 1024;
FileSizeLimitInfo newLimit = new FileSizeLimitInfo();
newLimit.setMaxFeedSize(feedSize);
newLimit.setMaxDocumentSize(fileSizeLimit.maxDocumentSize());
try {
xmlFeed = new XmlFeed(connectorName, feedType, newLimit, feedLog,
feedConnection);
} catch (OutOfMemoryError oome) {
throw new OutOfMemoryError(
"Unable to allocate feed buffer for connector " + connectorName);
}
}
} catch (IOException ioe) {
throw new PushException("Error creating feed", ioe);
}
LOGGER.fine("Allocated a new feed of size " + feedSize);
return;
}
/**
* Takes the accumulated XmlFeed and sends the feed to the GSA.
*
* @throws PushException if Pusher problem
* @throws FeedException if transient Feed problem
* @throws RepositoryException
*/
private void submitFeed()
throws PushException, FeedException, RepositoryException {
if (xmlFeed == null) {
return;
}
final XmlFeed feed = xmlFeed;
xmlFeed = null;
final String logMessage;
if (feedLog != null) {
logMessage = feedLog.toString();
feedLog = null;
} else {
logMessage = null;
}
try {
feed.close();
} catch (IOException ioe) {
throw new PushException("Error closing feed", ioe);
}
try {
// Send the feed to the GSA in a separate thread.
FutureTask<String> future = new FutureTask<String> (
new Callable<String>() {
public String call()
throws PushException, FeedException, RepositoryException {
try {
NDC.push("Feed " + feed.getDataSource());
return submitFeed(feed, logMessage);
} finally {
NDC.remove();
}
}
}
);
feedSender.execute(future);
// Add the future to list of outstanding submissions.
synchronized(submissions) {
submissions.add(future);
}
} catch (RejectedExecutionException ree) {
throw new FeedException("Asynchronous feed was rejected. ", ree);
}
}
/**
* Takes the supplied XmlFeed and sends that feed to the GSA.
*
* @param feed an XmlFeed
* @param logMessage a Feed Log message
* @return response String from GSA
* @throws PushException if Pusher problem
* @throws FeedException if transient Feed problem
* @throws RepositoryException
*/
private String submitFeed(XmlFeed feed, String logMessage)
throws PushException, FeedException, RepositoryException {
if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine("Submitting " + feed.getFeedType() + " feed for "
+ feed.getDataSource() + " to the GSA. " + feed.getRecordCount()
+ " records totaling " + feed.size() + " bytes.");
}
// Write the generated feedLog message to the feed logger.
if (logMessage != null && FEED_LOGGER.isLoggable(FEED_LOG_LEVEL)) {
FEED_LOGGER.log(FEED_LOG_LEVEL, logMessage);
}
// Write the Feed to the TeedFeedFile, if one was specified.
String teedFeedFilename = Context.getInstance().getTeedFeedFile();
if (teedFeedFilename != null) {
boolean isThrowing = false;
OutputStream os = null;
try {
os = new FileOutputStream(teedFeedFilename, true);
feed.writeTo(os);
} catch (IOException e) {
isThrowing = true;
throw new FeedException("Cannot write to file: " + teedFeedFilename, e);
} finally {
if (os != null) {
try {
os.close();
} catch (IOException e) {
if (!isThrowing) {
throw new FeedException(
"Cannot write to file: " + teedFeedFilename, e);
}
}
}
}
}
String gsaResponse = feedConnection.sendData(feed);
if (!gsaResponse.equals(GsaFeedConnection.SUCCESS_RESPONSE)) {
String eMessage = gsaResponse;
if (GsaFeedConnection.UNAUTHORIZED_RESPONSE.equals(gsaResponse)) {
eMessage += ": Client is not authorized to send feeds. Make "
+ "sure the GSA is configured to trust feeds from your host.";
}
if (GsaFeedConnection.INTERNAL_ERROR_RESPONSE.equals(gsaResponse)) {
eMessage += ": Check GSA status or feed format.";
}
throw new PushException(eMessage);
}
return gsaResponse;
}
}