Source Code of org.apache.pig.piggybank.storage.XMLLoader$XMLRecordReader

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.pig.piggybank.storage;


import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat;
import org.apache.pig.bzip2r.Bzip2TextInputFormat;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;


/**
 * Parses an XML input file given a specified identifier of tags to be loaded.
 * The output is a bag of XML elements where each element is returned as
 * a chararray containing the text of the matched XML element including the
 * start and end tags as well as the data between them. When elements with
 * the matching tag are nested, only the top-level one is returned.
 *
 */
public class XMLLoader extends LoadFunc {


  /**
   * Use this record reader to read XML tags out of a text file. It matches only
   * the tags identified by an identifier configured through a call to
   * {@link #setXMLIdentifier(String)}. It there are nesting tags of the given
   * identifier, only the top level one is returned which also includes all
   * enclosed tags.
   */
  public static class XMLRecordReader extends RecordReader<LongWritable, Text> {
    protected final RecordReader<LongWritable, Text> wrapped;


    /**Regular expression for XML tag identifier*/
    private static final String XMLTagNameRegExp = "[a-zA-Z\\_][0-9a-zA-Z\\-_]+";
    /**
     * A regular expression that matches key parts in the XML text needed to
     * correctly parse it and find matches of the given identifier
     */
    private Pattern identifiersPattern;


    private LongWritable key;
    private Text value;


    /**Position of the current buffer in the file*/
    private long bufferPos;


    /**Holds parts of the input file that were read but not parsed yet*/
    private String buffer;


    /**Original end of the split to parse*/
    private long originalEnd;


    private boolean terminated;


    public XMLRecordReader(RecordReader<LongWritable, Text> wrapped) {
      this.wrapped = wrapped;
    }


    /**
     * Delegate the initialization method to the wrapped stream after changing
     * the length of the split to be non-ending.
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
  throws IOException, InterruptedException {
      key = new LongWritable();
      value = new Text();
      if (split instanceof FileSplit) {
  FileSplit fsplit = (FileSplit) split;
  originalEnd = fsplit.getStart() + fsplit.getLength();
  Path path = fsplit.getPath();
  long fileEnd = path.getFileSystem(context.getConfiguration()).getFileStatus(path).getLen();
  FileSplit extendedSplit = new FileSplit(path, fsplit.getStart(),
      Math.min(fsplit.getLength() * 10, fileEnd - fsplit.getStart()), fsplit.getLocations());
  this.wrapped.initialize(extendedSplit, context);
      } else {
  throw new RuntimeException("Cannot override a split of type'"+
      split.getClass()+"'");
      }
    }


    public void setXMLIdentifier(String identifier) {
      if (!identifier.matches(XMLTagNameRegExp))
  throw new RuntimeException("XML tag identifier '"+identifier+"' does not match the regular expression /"+XMLTagNameRegExp+"/");
      String inlineClosedTagRegExp = "<\\s*"+identifier+"\\s*[^>]*/>";
      String openTagRegExp = "<\\s*"+identifier+"(?:\\s*|\\s+(?:[^/>]*|[^>]*[^>/]))>";
      String closeTagRegExp = "</\\s*"+identifier+"\\s*>";
      identifiersPattern = Pattern.compile("("+inlineClosedTagRegExp+")|("+openTagRegExp+")|("+closeTagRegExp+")");
    }


    /* Delegate all methods to the wrapped stream */
    public void close() throws IOException {
      wrapped.close();
    }


    public boolean equals(Object obj) {
      return wrapped.equals(obj);
    }


    public LongWritable getCurrentKey() throws IOException, InterruptedException {
      return key;
    }


    public Text getCurrentValue() throws IOException, InterruptedException {
      return value;
    }


    public float getProgress() throws IOException, InterruptedException {
      return Math.max(1.0f, this.wrapped.getProgress() * 10);
    }


    public int hashCode() {
      return wrapped.hashCode();
    }


    public boolean nextKeyValue() throws IOException, InterruptedException {
      if (this.terminated)
  return false;
      int depth = 0;
      // In case of an tag matched with an open tag and a closed tag, this buffer
      // is used to accumulate matched element if it is spans multiple lines.
      StringBuffer currentMatch = new StringBuffer();
      try {
      while (true) {
          // The start offset of first matched open tag. This marks the first byte
          // in the range to be copied to output.
          int offsetOfFirstMatchedOpenTag = 0;
        
  while (buffer == null || buffer.length() == 0) {
    if (!wrapped.nextKeyValue())
      return false; // End of split
    // if passed the end offset of current split, terminate the matching
    if (bufferPos >= originalEnd && depth == 0) {
      this.terminated = true;
      return false;
    }


    bufferPos = wrapped.getCurrentKey().get();
    buffer = wrapped.getCurrentValue().toString();
  }
  Matcher matcher = identifiersPattern.matcher(buffer);
  while (matcher.find()) {
    int startOfCurrentMatch = matcher.start();
    int endOfCurrentMatch = matcher.end();
    String group;
    if ((group = matcher.group(1)) != null) {
      // Matched an inline-closed tag
      value = new Text(group);
      this.key.set(bufferPos + matcher.start(1));
      bufferPos += matcher.end(1);
      buffer = buffer.substring(endOfCurrentMatch);
      return true;
    } else if ((group = matcher.group(2)) != null) {
      // Matched an open tag
      // If this is a top-level match (i.e., not enclosed in another matched
      // tag), all bytes starting from this offset will be copied to output
      // in one of two cases:
      // 1- When a matching close tag is found
      // 2- When an end of line is encountered
      if (depth == 0) {
        offsetOfFirstMatchedOpenTag = startOfCurrentMatch;
        this.key.set(bufferPos + startOfCurrentMatch);
      }
      depth++;
    } else if ((group = matcher.group(3)) != null) {
      // Matched a closed tag
      if (depth > 0) {
        depth--;
        if (depth == 0) {
    // A full top-level match
    // Copy all bytes to output
    if (currentMatch.length() == 0) {
      // A full match in one line, return it immediately
      value = new Text(buffer.substring(offsetOfFirstMatchedOpenTag, endOfCurrentMatch));
    } else {
      currentMatch.append(buffer, offsetOfFirstMatchedOpenTag, endOfCurrentMatch);
      value = new Text(currentMatch.toString());
    }
    // Copy remaining non matched part to the buffer for next call
    buffer = buffer.substring(endOfCurrentMatch);
    bufferPos += endOfCurrentMatch;
    return true;
        }
      }
    } else {
      throw new RuntimeException("Invalid match '"+matcher.group()+"' in string '"+buffer+"'");
    }
  }
  // No more matches in current line. If we are inside a match (i.e.,
  // an open tag has been matched) copy all parts to the match.
  // Otherwise, just drop it.
  if (depth > 0) {
    // Inside a match
    currentMatch.append(buffer, offsetOfFirstMatchedOpenTag, buffer.length());
  }
  buffer = null;
      }
      } catch (InterruptedException e) {
  throw new IOException("Error getting input");
      }


    }


    public String toString() {
      return wrapped.toString();
    }
  }


  /**Location of the file loaded*/
  private String loadLocation;


  /**Underlying record reader*/
  @SuppressWarnings("rawtypes")
  protected RecordReader in = null;


  /**XML tag to parse*/
  private String identifier;


  public XMLLoader(String identifier) {
    this.identifier = identifier;
  }


  @Override
  public void prepareToRead(RecordReader reader, PigSplit split)
      throws IOException {
    in = reader;
  }


  @Override
  public Tuple getNext() throws IOException {
    try {
      if (!in.nextKeyValue())
  return null;
      Tuple tuple = createTuple(in.getCurrentValue().toString());
      return tuple;
    } catch (InterruptedException e) {
      e.printStackTrace();
      return null;
    }
  }




  /**
   * Creates a tuple from a matched string
   */
  public Tuple createTuple(String str) {
    return TupleFactory.getInstance().newTuple(new DataByteArray(str));
  }


  @SuppressWarnings("rawtypes")
  @Override
  public InputFormat getInputFormat() throws IOException {
    if(loadLocation.endsWith(".bz2") || loadLocation.endsWith(".bz")) {
      return new Bzip2TextInputFormat() {
  @Override
  public RecordReader<LongWritable, Text> createRecordReader(
      InputSplit split, TaskAttemptContext context) {
    try {
      RecordReader<LongWritable, Text> originalReader =
    super.createRecordReader(split, context);
      XMLRecordReader reader = new XMLRecordReader(originalReader);
      reader.setXMLIdentifier(identifier);
      return reader;
    } catch (IOException e) {
      throw new RuntimeException("Cannot create input split", e);
    } catch (InterruptedException e) {
      throw new RuntimeException("Cannot create input split", e);
    }
  }
      };
    } else {
      return new PigTextInputFormat() {
  @Override
  public RecordReader<LongWritable, Text> createRecordReader(
      InputSplit split, TaskAttemptContext context) {
    RecordReader<LongWritable, Text> originalReader =
        super.createRecordReader(split, context);
    XMLRecordReader reader = new XMLRecordReader(originalReader);
    reader.setXMLIdentifier(identifier);
    return reader;
  }
      };
    }
  }


  @Override
  public void setLocation(String location, Job job) throws IOException {
    loadLocation = location;
    FileInputFormat.setInputPaths(job, location);
  }
}
Source Code of org.apache.pig.piggybank.storage.XMLLoader$XMLRecordReader

Related Classes of org.apache.pig.piggybank.storage.XMLLoader$XMLRecordReader