Package io.lumify.wikipedia.mapreduce

Source Code of io.lumify.wikipedia.mapreduce.WikipediaFileToMRFile

package io.lumify.wikipedia.mapreduce;

import io.lumify.core.util.LumifyLogger;
import io.lumify.core.util.LumifyLoggerFactory;
import io.lumify.wikipedia.RandomAccessFileInputStream;
import org.apache.commons.cli.*;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;

import java.io.*;
import java.nio.charset.Charset;
import java.text.DecimalFormat;

public class WikipediaFileToMRFile {
    private static final LumifyLogger LOGGER = LumifyLoggerFactory.getLogger(WikipediaFileToMRFile.class);
    private static final DecimalFormat NUMBER_FORMATTER = new DecimalFormat("#,###");

    public static void main(String[] args) throws ParseException, IOException {
        Options options = new Options();

        options.addOption(
                OptionBuilder
                        .withLongOpt("in")
                        .withDescription("Input file name")
                        .hasArg(true)
                        .withArgName("file")
                        .create("i")
        );
        options.addOption(
                OptionBuilder
                        .withLongOpt("out")
                        .withDescription("Output file name")
                        .hasArg(true)
                        .withArgName("file")
                        .create("o")
        );
        options.addOption(
                OptionBuilder
                        .withLongOpt("help")
                        .withDescription("Prints help")
                        .hasArg(false)
                        .create("h")
        );

        CommandLineParser parser = new GnuParser();
        CommandLine cmd = parser.parse(options, args);

        if (cmd.hasOption("help")) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("run", options, true);
            System.exit(1);
            return;
        }

        if (!cmd.hasOption("in")) {
            System.err.println("in is required");
            System.exit(1);
            return;
        }
        String in = cmd.getOptionValue("in");

        if (!cmd.hasOption("out")) {
            System.err.println("out is required");
            System.exit(1);
            return;
        }
        String out = cmd.getOptionValue("out");

        new WikipediaFileToMRFile().run(in, out);
    }

    private void run(String inputFileName, String outputFileName) throws IOException {
        InputStream in;
        RandomAccessFile randomAccessFile = null;

        File inputFile = new File(inputFileName);
        if (!inputFile.exists()) {
            throw new RuntimeException("Could not find " + inputFileName);
        }

        if (inputFile.getName().endsWith("bz2")) {
            FileInputStream fileInputStream = new FileInputStream(inputFile);
            in = new BZip2CompressorInputStream(fileInputStream);
        } else {
            randomAccessFile = new RandomAccessFile(inputFile, "r");
            in = new RandomAccessFileInputStream(randomAccessFile);
        }

        File outputFile = new File(outputFileName);
        if (outputFile.exists()) {
            throw new RuntimeException("Output file already exists " + outputFileName);
        }
        OutputStream out = new FileOutputStream(outputFile);

        run(randomAccessFile, in, out);
    }

    private void run(RandomAccessFile randomAccessFile, InputStream in, OutputStream out) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        try {
            String line;
            int lineNumber = 0;
            int pageCount = 0;
            boolean foundStartPage = false;

            while ((line = reader.readLine()) != null) {
                if ((lineNumber % 100000) == 0) {
                    LOGGER.info("Processing line " + NUMBER_FORMATTER.format(lineNumber) + (randomAccessFile == null ? "" : " (offset: " + randomAccessFile.getFilePointer() + ")"));
                }
                if (line.contains("<page>") && line.trim().equals("<page>")) {
                    foundStartPage = true;
                    writeLine(line, out);
                } else if (line.contains("</page>") && line.trim().equals("</page>")) {
                    writeLine(line, out);
                    out.write("\n".getBytes());

                    pageCount++;
                    if ((pageCount % 1000) == 0) {
                        LOGGER.info("Processing page " + NUMBER_FORMATTER.format(pageCount));
                    }
                    foundStartPage = false;
                } else if (foundStartPage) {
                    writeLine(line, out);
                    out.write("\\n".getBytes());
                }
                lineNumber++;
            }
        } finally {
            reader.close();
        }
    }

    private void writeLine(String line, OutputStream out) throws IOException {
        out.write(line.trim().getBytes());
    }
}
TOP

Related Classes of io.lumify.wikipedia.mapreduce.WikipediaFileToMRFile

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.