Package com.fb2pdf.hadoop

Source Code of com.fb2pdf.hadoop.XMLTextReader

package com.fb2pdf.hadoop;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.*;

import com.fb2pdf.hadoop.OnDemandSAXParser.StringPair;


public class XMLTextReader implements RecordReader<Text, Text>
{

    private CompressionCodecFactory compressionCodecs;
    InputStream                     in;
    private long                    start;
    private long                    end;
    private OnDemandSAXParser       parser;
    private String                  stopElement = null;
    private boolean                 stopElementFound = false;

    public XMLTextReader(JobConf job, FileSplit split) throws IOException
    {
        this.stopElement = job.get("fb2.xmlreader.stopelement", null);

        final Path file = split.getPath();
        start = split.getStart();
        end = start + split.getLength();

        compressionCodecs = new CompressionCodecFactory(job);
        final CompressionCodec codec = compressionCodecs.getCodec(file);

        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(split.getPath());

        fileIn.seek(start);
        if(codec != null)
            in = codec.createInputStream(fileIn);
        else
            in = fileIn;

        parser = new OnDemandSAXParser(in);
    }

    @Override
    public void close() throws IOException
    {
        parser.stop();
        IOUtils.closeQuietly(in);
    }

    @Override
    public Text createKey()
    {
        return new Text();
    }

    @Override
    public Text createValue()
    {
        return new Text();
    }

    @Override
    public long getPos() throws IOException
    {
        return start; // TODO: implement
    }

    @Override
    public float getProgress() throws IOException
    {
        if(start == end)
            return 0.0f;
        else
            return Math.min(1.0f, (getPos() - start) / (float) (end - start));
    }

    @Override
    public boolean next(Text key, Text value) throws IOException
    {
        StringPair n = parser.getNext();
        if(n != null)
        {
            if (stopElement != null && n.a != null)
            {
                if (!stopElementFound)
                    stopElementFound = n.a.startsWith(stopElement);
                else if (!n.a.startsWith(stopElement))
                    return false;
            }

            key.set(n.a);
            value.set(n.b);
            return true;
        } else
        {
            return false; // EOF
        }
    }

}
TOP

Related Classes of com.fb2pdf.hadoop.XMLTextReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact coftware#gmail.com.