Package com.alimama.mdrill.index

Source Code of com.alimama.mdrill.index.IndexMapper

package com.alimama.mdrill.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import com.alimama.mdrill.index.utils.DocumentMap;
import com.alimama.mdrill.index.utils.JobIndexPublic;
import com.alimama.mdrill.index.utils.PairWriteable;
import com.alimama.mdrill.index.utils.TdateFormat;



public class IndexMapper extends   Mapper<WritableComparable, Text, PairWriteable, DocumentMap> {
    private String[] fields = null;
    private Boolean[] isDate;
    private Boolean[] isString;
    private Boolean[] isStore;
   
    private String[][] contains;
    private boolean  containsfilter=false;
   
    private String split="\001";
    private boolean usedthedate=true;
    private String thedate=null;
   
    private Integer Index=(int) (Math.random()*10000);
   
    private String uniqfield="";
    private int uniqfieldIndex=-1;
    private boolean isuniqcheck=false;
   
    private int thedateIndex=-1;

   
    @Override
    public void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
   
 
   
    TaskID taskId = context.getTaskAttemptID().getTaskID();
    this.Index = taskId.getId();
    System.out.println("###########>>>>"+this.Index);
    Configuration conf = context.getConfiguration();

    String mode=conf.get("mdrill.table.mode","");
    HashMap<String,ArrayList<String>> contanis=new HashMap<String, ArrayList<String>>();
   
    if(mode.indexOf("@fieldcontains:")>=0)
    {
     
       Pattern mapiPattern      = Pattern.compile("@fieldcontains:([^@]+)@");
       Matcher mat=mapiPattern.matcher(mode);
           while (mat.find()) {
             String matchStr= mat.group(1);
             String[] kv=matchStr.split("&");
             for(String s:kv)
             {
               String[] kvpair=s.split("=");
               if(kvpair.length>=2){
                 ArrayList<String> list=contanis.get(kvpair[0]);
                 if(list==null)
                 {
                   list=new ArrayList<String>();
                   contanis.put(kvpair[0], list);
                 }
                 list.add( kvpair[1]);
               }
             }
            
           }
     
     
    }
   
    containsfilter=contanis.size()>0;

    String fieldStrs = conf.get("higo.index.fields");
    this.uniqfield= conf.get("uniq.check.field");
    if(this.uniqfield!=null&&this.uniqfield.length()>0)
    {
      this.isuniqcheck=true;
    }
    split=MakeIndex.parseSplit(conf.get("higo.column.split",split));
    String custfields=conf.get("higo.column.custfields","");
    usedthedate=conf.getBoolean("higo.column.userthedate", usedthedate);
    this.thedate=null;
     InputSplit inputSplit = context.getInputSplit();
       Path filepath = ((FileSplit) inputSplit).getPath();
      
       if(filepath!=null)
       {
         String pash=filepath.toString();
        this.Index=pash.hashCode()%1000000;
        if(this.Index<0)
        {
         this.Index*=-1;
        }
       
       }
    if(usedthedate)
    {
   
       String inputbase = conf.get("higo.input.base");
       this.thedate=JobIndexPublic.parseThedate(new Path(inputbase),filepath);
     System.out.println("thedatepath: " + thedate+"@"+filepath.toString() +"@"+inputbase + "");
    }
   

   
    if(custfields==null||custfields.isEmpty())
    {
      String[] fieldslist = fieldStrs.split(",");
      this.fields = new String[fieldslist.length];
      this.isDate = new Boolean[fieldslist.length];
      this.isString = new Boolean[fieldslist.length];
      this.isStore = new Boolean[fieldslist.length];
      this.contains=new String[fieldslist.length][];
   
      for (int i = 0; i < fieldslist.length; i++) {
          String[] fieldSchema = fieldslist[i].split(":");
          String fieldName = fieldSchema[0].trim().toLowerCase();
          String type = fieldSchema[1];
          this.fields[i] = fieldName;
         
          ArrayList<String> filterlist=contanis.get(fieldName);
          if(filterlist==null)
          {
            this.contains[i]=null;
          }else{
            String[] filterarr=new String[filterlist.size()];
            this.contains[i]=filterlist.toArray(filterarr);
          }
         

          if(this.fields[i].equals("thedate"))
          {
            thedateIndex=i;
          }
         
          if(this.isuniqcheck)
          {
            if(this.fields[i].equals(this.uniqfield))
            {
              uniqfieldIndex=i;
            }
          }
          this.isStore[i] = Boolean.valueOf(fieldSchema[3]);
          this.isDate[i] = type.equalsIgnoreCase("tdate");
          this.isString[i] = type.equalsIgnoreCase("string");
      }
    }else{
      String[] fieldslist = custfields.split(",");
      this.fields = new String[fieldslist.length];
      this.isDate = new Boolean[fieldslist.length];
      this.isString = new Boolean[fieldslist.length];
      this.isStore = new Boolean[fieldslist.length];
   
      for (int i = 0; i < fieldslist.length; i++) {
          this.isStore[i] = Boolean.valueOf(false);
          this.fields[i] = fieldslist[i];
          if(this.fields[i].equals("thedate"))
          {
            thedateIndex=i;
          }
          if(this.isuniqcheck)
          {
            if(this.fields[i].equals(this.uniqfield))
            {
              uniqfieldIndex=i;
            }
          }
          this.isDate[i]= false;
          this.isString[i] = true;
      }
    }
    }

    protected void cleanup(Context context) throws IOException,
      InterruptedException {
      
    }
   
    private String parseDefault(String input,Context context)
    {
      if (input == null) {
      return null;
    }
      input=input.trim();
    if (input.isEmpty() || input.equals("\\N")|| input.equals("\\n")|| input.toLowerCase().equals("null")) {
        context.getCounter("higo", "nullcolcount").increment(1);
      return null;
    }
   
    if(input.length()>=512000)
    {
        context.getCounter("higo", "bigtextskip").increment(1);

      return null;
    }

    return input;
    }
 
    private int debuglines=0;
    private int printlines=0;
   
    private boolean validate(String[] values,String record,Context context)
    {
      if(usedthedate)
      {
        if(values.length<2)
        {
          if(debuglines<100)
          {
            debuglines++;
              System.out.println("miss columns values2: " + record.replaceAll(split, "#")   + "");
          }
          context.getCounter("higo", "skiprecords").increment(1);
          return false;
        }
      }else{
          if(parseDefault(record,context)==null)
          {
          return false;
          }

      }
      return true;
    }
   
    private boolean line(String record,Context context) throws IOException, InterruptedException
    {
    context.getCounter("higo", "totalrecord").increment(1);
      String[] values = record.split(split,-1);
      if(!this.validate(values, record, context))
      {
        return false;
      }

      String[] res =new String[fields.length];
      for (int i = 0; i < fields.length; i++) {
          String fieldName = fields[i];
          String string =(i<values.length)?values[i]:null;
          String val=parseDefault(string,context);

      if (this.isDate[i]) {
        res[i]=TdateFormat.ensureTdate(val, fieldName);
      }else if(val!=null){
        res[i]=val;
      }else if(this.isString[i]){
        res[i]="_";
      }
      }
     
      if(usedthedate&&thedateIndex>=0)
      { 
        if(thedate!=null)
        {
          res[thedateIndex]=thedate;
        }
      res[thedateIndex]=String.valueOf(res[thedateIndex]).replaceAll("-", "").replaceAll("_", "");

        if(res[thedateIndex]==null||res[thedateIndex].length()!=8)
        {
          if(debuglines<100)
          {
            debuglines++;
            System.out.println("miss thedate values: " + record.replaceAll(split, "#")   + "");
          }
          context.getCounter("higo", "skiprecords").increment(1);
        }
       
        context.getCounter("higo", "dayrecord_"+String.valueOf(res[thedateIndex])).increment(1);
      }
     
     
      if(printlines<10)
    {
        printlines++;
        System.out.println("res:"+Arrays.toString(values));
    }
     
      if(this.containsfilter)
      {
        int maxlen=res.length;
        for(int i=0;i<this.contains.length;i++)
        {
          String[] containslist=this.contains[i];
         
          if(containslist!=null)
          {
            if(i>=maxlen)
            {
              context.getCounter("higo", "skiprecords_filter").increment(1);
              return true;
            }
           
           
            String val=res[i];
            for(String s:containslist)
            {
              if(val.indexOf(s)<0)
              {
                  context.getCounter("higo", "skiprecords_filter").increment(1);
                return true;
              }
            }
          }
         
        }
      }
     
      context.write(new PairWriteable(this.Index++), new DocumentMap(res));

     
      if(this.isuniqcheck&&uniqfieldIndex>0&&res[uniqfieldIndex]!=null)
      {
        String notempty=res[uniqfieldIndex];
        if(notempty.length()>0&&!notempty.equals("_"))
        {
          context.write(new PairWriteable(new Text("uniq_"+notempty)), new DocumentMap());
        }
      }
     
      return true;
       
    }

    @Override
    public void map(WritableComparable key, Text value, Context context)
throws IOException, InterruptedException {
    String[] records = value.toString().split("[\n]+");
    for(String record:records)
    {
      this.line(record, context);
    }
  }

}
TOP

Related Classes of com.alimama.mdrill.index.IndexMapper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact: coftware#gmail.com.