/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.tpch;
import com.facebook.presto.execution.TaskId;
import com.facebook.presto.metadata.ColumnFileHandle;
import com.facebook.presto.operator.OperatorContext;
import com.facebook.presto.operator.Page;
import com.facebook.presto.operator.RecordProjectOperator;
import com.facebook.presto.operator.TaskContext;
import com.facebook.presto.serde.BlocksFileEncoding;
import com.facebook.presto.spi.ColumnMetadata;
import com.facebook.presto.sql.analyzer.Session;
import com.facebook.presto.util.DelimitedRecordSet;
import com.facebook.presto.util.Threads;
import com.google.common.base.Splitter;
import com.google.common.base.Throwables;
import com.google.common.hash.Hashing;
import com.google.common.io.ByteStreams;
import com.google.common.io.Files;
import com.google.common.io.InputSupplier;
import com.google.common.io.Resources;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;

import static com.google.common.base.Charsets.UTF_8;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.io.CharStreams.newReaderSupplier;
/**
* Extracts TPCH data into serialized column file formats.
* It will also cache the extracted columns in the local file system to help mitigate the cost of the operation.
*/
public class GeneratingTpchDataFileLoader
implements TpchDataFileLoader
{
private final TableInputSupplierFactory tableInputSupplierFactory;
private final File cacheDirectory;
public GeneratingTpchDataFileLoader(TableInputSupplierFactory tableInputSupplierFactory, File cacheDirectory)
{
checkNotNull(tableInputSupplierFactory, "tableInputStreamProvider is null");
checkNotNull(cacheDirectory, "cacheDirectory is null");
checkArgument(!cacheDirectory.exists() || cacheDirectory.isDirectory(), "cacheDirectory must be a directory");
this.tableInputSupplierFactory = tableInputSupplierFactory;
this.cacheDirectory = cacheDirectory;
}
public GeneratingTpchDataFileLoader(TableInputSupplierFactory tableInputSupplierFactory, String cacheDirectoryName)
{
this(tableInputSupplierFactory, new File(checkNotNull(cacheDirectoryName, "cacheDirectoryName is null")));
}
public GeneratingTpchDataFileLoader(String cacheDirectoryName)
{
this(autoSelectTableInputStreamProvider(), cacheDirectoryName);
}
public GeneratingTpchDataFileLoader()
{
this(System.getProperty("tpchCacheDir", "/tmp/tpchdatacache"));
}
private interface TableInputSupplierFactory
{
InputSupplier<InputStream> getInputSupplier(String tableName);
}
private static class JarTableInputSupplierFactory
implements TableInputSupplierFactory
{
private final String jarFileName;
private JarTableInputSupplierFactory(String jarFileName)
{
this.jarFileName = checkNotNull(jarFileName, "jarFileName is null");
}
@Override
public InputSupplier<InputStream> getInputSupplier(final String tableName)
{
checkNotNull(tableName, "tableFileName is null");
return new InputSupplier<InputStream>()
{
@Override
public InputStream getInput()
throws IOException
{
try {
JarFile jarFile = new JarFile(jarFileName);
return jarFile.getInputStream(jarFile.getJarEntry(createTableFileName(tableName)));
}
catch (IOException e) {
throw Throwables.propagate(e);
}
}
};
}
}
private static class ResourcesTableInputSupplierFactory
implements TableInputSupplierFactory
{
@Override
public InputSupplier<InputStream> getInputSupplier(String tableName)
{
checkNotNull(tableName, "tableFileName is null");
return Resources.newInputStreamSupplier(Resources.getResource(createTableFileName(tableName)));
}
}
private static TableInputSupplierFactory autoSelectTableInputStreamProvider()
{
// First check if a data jar file has been manually specified
final String tpchDataJarFileOverride = System.getProperty("tpchDataJar");
if (tpchDataJarFileOverride != null) {
return new JarTableInputSupplierFactory(tpchDataJarFileOverride);
}
// Otherwise fall back to the default in resources if one is available
else {
return new ResourcesTableInputSupplierFactory();
}
}
@Override
public File getDataFile(TpchTableHandle tableHandle, TpchColumnHandle columnHandle, BlocksFileEncoding encoding)
{
checkNotNull(tableHandle, "tableHandle is null");
checkNotNull(columnHandle, "columnHandle is null");
checkNotNull(encoding, "encoding is null");
String tableName = tableHandle.getTableName();
ExecutorService executor = Executors.newCachedThreadPool(Threads.daemonThreadsNamed("tpch-generate-%s"));
try {
String hash = ByteStreams.hash(ByteStreams.slice(tableInputSupplierFactory.getInputSupplier(tableName), 0, 1024 * 1024), Hashing.murmur3_32()).toString();
File cachedFile = new File(new File(cacheDirectory, tableName + "-" + hash), "new-" + createFileName(columnHandle, encoding));
if (cachedFile.exists()) {
return cachedFile;
}
Files.createParentDirs(cachedFile);
InputSupplier<InputStream> inputSupplier = tableInputSupplierFactory.getInputSupplier(tableName);
ColumnMetadata columnMetadata = new TpchMetadata().getColumnMetadata(tableHandle, columnHandle);
DelimitedRecordSet records = new DelimitedRecordSet(newReaderSupplier(inputSupplier, UTF_8), Splitter.on("|"), columnMetadata);
Session session = new Session("user", "source", "catalog", "schema", "address", "agent");
OperatorContext operatorContext = new TaskContext(new TaskId("query", "stage", "task"), executor, session)
.addPipelineContext(true, true)
.addDriverContext()
.addOperatorContext(0, "tpch-generate");
RecordProjectOperator source = new RecordProjectOperator(operatorContext, records);
ColumnFileHandle columnFileHandle = ColumnFileHandle.builder(0)
.addColumn(columnHandle, cachedFile, encoding)
.build();
while (!source.isFinished()) {
Page page = source.getOutput();
if (page != null) {
columnFileHandle.append(page);
}
}
columnFileHandle.commit();
return cachedFile;
}
catch (IOException e) {
throw Throwables.propagate(e);
}
finally {
executor.shutdownNow();
}
}
private static String createTableFileName(String tableName)
{
return tableName + ".tbl";
}
private static String createFileName(TpchColumnHandle columnHandle, BlocksFileEncoding encoding)
{
return String.format("column%d.%s_%s.data", columnHandle.getFieldIndex(), columnHandle.getType(), encoding.getName());
}
}