Package com.alimama.mdrill.index

Source Code of com.alimama.mdrill.index.IndexReducer

package com.alimama.mdrill.index;
import java.io.IOException;
import java.io.StringReader;
import java.util.Iterator;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.TermInfosWriter;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.schema.IndexSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.alimama.mdrill.index.utils.DocumentConverter;
import com.alimama.mdrill.index.utils.DocumentList;
import com.alimama.mdrill.index.utils.DocumentMap;
import com.alimama.mdrill.index.utils.HeartBeater;
import com.alimama.mdrill.index.utils.JobIndexPublic;
import com.alimama.mdrill.index.utils.PairWriteable;
import com.alimama.mdrill.index.utils.RamWriter;
import com.alimama.mdrill.index.utils.ShardWriter;


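/**
 * Reduce-side indexer for mdrill: collects DocumentMap values per key into an
 * in-memory buffer, converts them to Lucene documents via DocumentConverter,
 * periodically flushes them through a RamWriter into a local ShardWriter, and
 * finally copies the finished shard to HDFS, emitting (partition, hdfsPath).
 */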
public class IndexReducer extends Reducer<PairWriteable, DocumentMap, IntWritable, Text> {
    public static Logger LOG = LoggerFactory.getLogger(IndexReducer.class);

    private HeartBeater heartBeater = null;
    private ShardWriter shardWriter = null;
    private String tmpath = null;
    private String localtmpath = null;
    private String indexHdfsPath = null;

    private Analyzer analyzer;
    private DocumentConverter documentConverter = null;
    DocumentList doclistcache = new DocumentList();
    RamWriter ramMerger = null;
    boolean isNotFdtMode = true;

    private String[] fields = null;

    private int Index=0;
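  /**
   * Reads the field configuration (higo.index.fields / higo.column.custfields),
   * tunes TermInfosWriter for the table mode, starts the heartbeat thread,
   * creates the DocumentConverter from solrconfig.xml/schema.xml, and opens
   * the local ShardWriter that this reducer writes its index shard to.
   */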
  protected void setup(Context context) throws java.io.IOException,
      InterruptedException {
    super.setup(context);
    debugInfo=0;
    doccount=0;
    TaskID taskId = context.getTaskAttemptID().getTaskID();
    this.Index = taskId.getId();

    context.getCounter("higo", "dumpcount").increment(0);

    Configuration conf = context.getConfiguration();
    isNotFdtMode=conf.get("mdrill.table.mode","").indexOf("@hdfs@")<0;
   
    String fieldStrs = context.getConfiguration().get("higo.index.fields");

    String custfields=context.getConfiguration().get("higo.column.custfields","");
   
    if(custfields==null||custfields.isEmpty())
    {
      String[] fieldslist = fieldStrs.split(",");
      this.fields = new String[fieldslist.length];
   
      for (int i = 0; i < fieldslist.length; i++) {
          String[] fieldSchema = fieldslist[i].split(":");
          String fieldName = fieldSchema[0].trim().toLowerCase();
          this.fields[i] = fieldName;
      }
    }else{
      String[] fieldslist = custfields.split(",");
      this.fields = new String[fieldslist.length];
   
      for (int i = 0; i < fieldslist.length; i++) {
          this.fields[i] = fieldslist[i];
      }
    }

    TermInfosWriter.setNotUseQuick(true);
    if(!isNotFdtMode)
    {
      TermInfosWriter.setSkipInterVal(16);
    }

    heartBeater = new HeartBeater(context);
    heartBeater.needHeartBeat();
    this.doclistcache=new DocumentList();
    this.ramMerger = new RamWriter();

    String[] fieldslist = fieldStrs.split(",");
    this.documentConverter = new DocumentConverter(fieldslist,"solrconfig.xml", "schema.xml");
    shardWriter = this.initShardWriter(context);
    LOG.info("end initShardWriter");

    try {
      this.analyzer = this.documentConverter.getAnalyzer();
      JobIndexPublic.setAnalyzer(conf);
    } catch (Exception e) {
      throw new IOException(e);
    }
   
    LOG.info("end set up");


  }
 
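  /**
   * Standalone tokenizer check: loads a Solr schema from hard-coded local
   * paths and prints the terms produced for a sample "rawquery" string.
   * Not part of the MapReduce job; useful only for local analyzer debugging.
   */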
  public static void main(String[] args) throws IOException, ParserConfigurationException, SAXException, ClassNotFoundException {
   
    IndexSchema schema=null;
    SolrConfig solrConfig = new SolrConfig("E:\\一淘svn\\higo\\trunk\\adhoc-core\\solr\\conf\\solrconfig.xml");
    InputSource is = new InputSource(solrConfig.getResourceLoader().openSchema("E:\\一淘svn\\higo\\trunk\\adhoc-core\\solr\\conf\\schema.xml"));
    schema = new IndexSchema(solrConfig, "solrconfig", is);
    String s = "Ab94aa4CdDbd34dfde082ed1b4c4d0c505b69";

    StringReader sr = new StringReader(s);
//    Analyzer analyzer =new StandardAnalyzer((Version) Enum.valueOf((Class) Class.forName("org.apache.lucene.util.Version"),  Version.LUCENE_35.name()));
    Analyzer analyzer = schema.getAnalyzer();//JobIndexPublic.setAnalyzer(conf);
    TokenStream tk = analyzer.tokenStream("rawquery", sr);
    boolean hasnext = tk.incrementToken();
    while (hasnext) {
      TermAttribute ta = tk.getAttribute(TermAttribute.class);
      System.out.println(ta.term());
      hasnext = tk.incrementToken();
    }
  }

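  /**
   * Flushes any buffered documents, optimizes and closes the shard, copies
   * the local index directory to HDFS (via a _tmpindex staging path), emits
   * (partition, indexHdfsPath) when documents were written, removes the local
   * temp directory, and stops the heartbeat thread.
   */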
  protected void cleanup(Context context) throws IOException, InterruptedException {
    try {
      LOG.info("begin clean up");
      RamWriter ram = doclistcache.toRamWriter(documentConverter, analyzer,context);
      ramMerger.process(ram);
      if (this.maybeFlush(ramMerger, context, true)) {
        ramMerger = null;
      }
      doclistcache=new DocumentList();
      shardWriter.optimize();
      shardWriter.close();
      Configuration conf = context.getConfiguration();
      FileSystem fs = FileSystem.get(conf);
      fs.copyFromLocalFile(new Path(localtmpath), new Path(tmpath));

      if (!fs.exists(new Path(indexHdfsPath))) {
        fs.rename(new Path(tmpath), new Path(indexHdfsPath));
      }
      if (shardWriter.getNumDocs() > 0 && lastkey != null) {
        TaskID taskId = context.getTaskAttemptID().getTaskID();
        int partition = taskId.getId();
        System.out.println("###########>>>>"+partition);
        context.write(new IntWritable(partition),new Text(indexHdfsPath));
      }
      FileSystem lfs = FileSystem.getLocal(conf);
      if(lfs.exists(new Path(localtmpath)))
      {
        lfs.delete(new Path(localtmpath),true);
      }

    } catch (Throwable e) {
      LOG.error("cleanup",e);

      throw new IOException(e);
    }

    heartBeater.cancelHeartBeat();
    heartBeater.interrupt();
  }

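  /**
   * Builds the output paths for this reducer's shard: the final HDFS location
   * under mapred.output.dir, a staging path under _tmpindex, and a local temp
   * directory, then opens a ShardWriter on the local filesystem.
   */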
  private ShardWriter initShardWriter(Context context) throws IOException {
    String part_xxxxx = JobIndexPublic.getOutFileName(context, "part");
    Configuration conf = context.getConfiguration();
    FileSystem lfs = FileSystem.getLocal(conf);

    String outputdir = conf.get("mapred.output.dir");
    indexHdfsPath = new Path(outputdir, part_xxxxx).toString();

    tmpath = new Path(outputdir + "/_tmpindex", part_xxxxx + "_" + java.util.UUID.randomUUID().toString()).toString();
    localtmpath = new Path("./_tmpindex", part_xxxxx + "_" + java.util.UUID.randomUUID().toString()).toString();
   
    ShardWriter shardWriter = new ShardWriter(lfs, localtmpath, conf);
    return shardWriter;
  }

    private IntWritable lastkey = null;
    private int debuglines=0;

    long debugInfo=0;
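  /**
   * For non-numeric keys, only counts duplicates into the "higo"/"dumpcount"
   * counter. For numeric keys, adds each DocumentMap to the in-memory
   * DocumentList cache and, once the cache is over-size, converts it to a
   * RamWriter segment and merges it, flushing to the ShardWriter via
   * maybeFlush when the buffered segment grows large enough.
   */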
  protected void reduce(PairWriteable key, Iterable<DocumentMap> values,
      Context context) throws java.io.IOException, InterruptedException {
    if(debugInfo%10000==0)
    {
      LOG.info("debugInfo:"+debugInfo);
      if(debugInfo>maxDocCount_l)
      {
        LOG.info("debugInfo>maxDocCount_l:"+debugInfo);
        return ;
      }
    }
    debugInfo++;

    if(!key.isNum())
    {
      int dumps = 0;
      Iterator<DocumentMap> iterator = values.iterator();
      while (iterator.hasNext()) {
        iterator.next();
        dumps++;
      }
      if (dumps > 1) {
        context.getCounter("higo", "dumpcount").increment(1);
        if (debuglines < 100) {
          debuglines++;
          System.out.println("dumpcount: " + key.toString());
        }
      }
      return ;
    }
   
    lastkey = new IntWritable(key.getIndex());
   
    Iterator<DocumentMap> iterator = values.iterator();
    while (iterator.hasNext()) {
      if(doccount>maxDocCount||debugInfo>maxDocCount_l)
      {
        LOG.info("count over:"+debugInfo);

        break ;
      }
     
      DocumentMap map=iterator.next();
     
      int addcnt=doclistcache.add(map,this.fields);
      if(addcnt<=0)
      {
        context.getCounter("higo", "addempty").increment(1);
        continue;
      }
      if(!doclistcache.isoversize())
      {
        continue;
      }
      RamWriter ram = doclistcache.toRamWriter(documentConverter, analyzer,context);
      doclistcache=new DocumentList();

     
      ramMerger.process(ram);
      if (this.maybeFlush(ramMerger, context, false)) {
        ramMerger = new RamWriter();
      }
    }
  }
   
 
   
    private long minsize=1024l*1024*32;
    private static long maxDocCount=10000l*1000*10;
    private static long maxDocCount_l=10000l*1000*100;

    long doccount=0;
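  /**
   * Flushes the given RamWriter into the ShardWriter when it holds at least
   * 1000 documents and exceeds minsize bytes, holds 10000 or more documents,
   * or when force is set. Returns true if a flush happened so the caller can
   * start a fresh RamWriter.
   */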
  private boolean maybeFlush(RamWriter form,
      Context context, boolean force)
      throws IOException {
    if (form == null) {
      return false;
    }
    Integer docs=form.getNumDocs();
    if(docs<=0)
    {
      return false;
    }
    if ((docs >= 1000 && form.totalSizeInBytes() > minsize) || force || docs >= 10000) {
      try{

      context.getCounter("higo", "docCount").increment(docs);;
      doccount+=docs;
      form.closeWriter();

      shardWriter.process(form);

      form.closeDir();
      }catch(Throwable e)
      {
        LOG.error("maybeFlush error",e);
        throw new IOException(e);
      }

      return true;
    }

    return false;
  }
}