package com.alimama.mdrill.index.utils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.text.NumberFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import backtype.storm.topology.TopologyBuilder;
import com.alimama.mdrill.utils.EncodeUtils;
import com.alimama.mdrill.utils.HadoopBaseUtils;
import com.alimama.mdrill.utils.HadoopUtil;
import com.alimama.mdrill.utils.UniqConfig;
public class JobIndexPublic {
public static String getOutFileName(TaskAttemptContext context,
String prefix) {
TaskID taskId = context.getTaskAttemptID().getTaskID();
int partition = taskId.getId();
NumberFormat nf = NumberFormat.getInstance();
nf.setMinimumIntegerDigits(5);
nf.setGroupingUsed(false);
StringBuilder result = new StringBuilder();
result.append(prefix);
result.append("-");
result.append(nf.format(partition));
return result.toString();
}
public static Analyzer setAnalyzer(Configuration _conf) throws Exception {
return new StandardAnalyzer((Version) Enum.valueOf((Class) Class.forName("org.apache.lucene.util.Version"), Version.LUCENE_35.name()));
}
public static String readFieldsFromSchemaXml(String schemaFile,
FileSystem fs, Configuration conf) throws Exception {
String regex = "<field\\s+name=\"([^\"]*?)\"\\s+type=\"([^\"]*?)\"\\s+indexed=\"([^\"]*?)\"\\s+stored=\"([^\"]*?)\"\\s*.*/>";
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher("");
List<String> list = new ArrayList<String>();
BufferedReader br = null;
String[] fields = null;
try {
FSDataInputStream in=fs.open(new Path(schemaFile));
br = new BufferedReader(new InputStreamReader(in));
String temp = null;// 获得的值以空格分隔,"fieldName fieldType"
while ((temp = br.readLine()) != null) {
matcher.reset(temp);
if (matcher.find()) {
String fnft = matcher.group(1) + " " + matcher.group(2)
+ " " + matcher.group(3) + " " + matcher.group(4);
System.out.println(fnft);
list.add(fnft);
}
}
in.close();
} finally {
if (br != null) {
br.close();
}
}
StringBuilder result = new StringBuilder();
fields = list.toArray(new String[0]);
for (int i = 0; i < fields.length; i++) {
String field = fields[i];
String[] ss = field.split(" ");
if (ss.length == 4) {
String temp = field.replaceAll(" ", ":");
result.append(temp);
if (i != fields.length - 1) {
result.append(",");
}
}
}
return result.toString();
}
private static String findContainingJar(Class<?> myClass) {
ClassLoader loader = myClass.getClassLoader();
String class_file = myClass.getName().replaceAll("\\.", "/") + ".class";
if(loader==null)
{
return null;
}
try {
for (Enumeration<URL> itr = loader.getResources(class_file); itr.hasMoreElements();) {
URL url = itr.nextElement();
if ("jar".equals(url.getProtocol())) {
String toReturn = url.getPath();
if (toReturn.startsWith("file:")) {
toReturn = toReturn.substring("file:".length());
}
toReturn = URLDecoder.decode(toReturn, "UTF-8");
return toReturn.replaceAll("!.*$", "");
}
}
} catch (IOException e) {}
return null;
}
private static void addpath(HashSet<String> jars,Class<?> myClass)
{
String path=findContainingJar(myClass);
if(path==null)
{
return ;
}
jars.add(path);
}
public static void setJars(Configuration conf)
throws IOException, URISyntaxException {
HashSet<String> jars = new HashSet<String>();
addpath(jars, UniqConfig.class);
addpath(jars, HadoopBaseUtils.class);
addpath(jars, EncodeUtils.class);
addpath(jars, HadoopUtil.class);
addpath(jars, TopologyBuilder.class);
addpath(jars, org.slf4j.Logger.class);
addpath(jars, org.slf4j.LoggerFactory.class);
addpath(jars, org.apache.commons.httpclient.HttpClient.class);
addpath(jars, org.apache.commons.io.IOUtils.class);
addpath(jars, org.apache.solr.request.mdrill.MdrillDetail.class);
addpath(jars, org.xml.sax.SAXException.class);
jars.remove(findContainingJar(JobIndexPublic.class));
StringBuilder b = new StringBuilder();
String join = "";
for (String s : jars) {
b.append(join);
b.append(s);
join = ",";
}
new GenericOptionsParser(conf, new String[]{"-libjars",b.toString()});
System.out.println("tmpjars:"+b.toString()+"@@@@@"+conf.get("tmpjars"));
}
public static void setDistributecache(Path distpath, FileSystem fs,
Configuration conf) throws IOException, URISyntaxException {
DistributedCache.createSymlink(conf);
FileStatus[] flist = fs.listStatus(distpath);
for (FileStatus f : flist) {
if (f.isDir()) {
continue;
}
DistributedCache.addCacheFile(new URI(new Path(distpath, f
.getPath().getName()).toUri().toString()
+ "#"
+ f.getPath().getName()), conf);
}
}
public static void main(String[] args) {
System.out.println(parseThedate("dt=20120321"));
System.out.println(parseThedate("dt=20120321000000"));
System.out.println(parseThedate("ef=20120321000000"));
System.out.println(parseThedate("dt=2012032w"));
System.out.println(parseThedate("dt=201203212"));
System.out.println(parseThedate("201203212"));
System.out.println(parseThedate("201203212123"));
System.out.println(parseThedate("201203212877d"));
System.out.println("===");
System.out.println(parseThedate(new Path("/xxx/xxx//xxx/x/xxxxx///x"), new Path("/xxx/////////////.////////./////////////////////////xxx//xxx/x/xxxxx///x","a/dt=20120321/abc.txt")));
System.out.println(parseThedate(new Path("/xxx/xxx//xxx/x/xxxxx///x"), new Path("/xxx/xxx//xxx/x/xxxxx///x","a/dt=20120321//e/abc.txt")));
System.out.println(parseThedate(new Path("/xxx/xxx//xxx/x/xxxxx///x"), new Path("/xxx/xxx//xxx/x/xxxxx///x","a/dt=20120321//e/dt=20120322")));
System.out.println(parseThedate(new Path("/xxx/xxx/./xxx/x/xxxxx///x"), new Path("/xxx/xxx//////////////////xxx/x/xxxxx///x","a/dt=20120321//e/dt=20120322")));
}
public static String parseThedate(String name)
{
if(name.indexOf("=") >= 0)
{
name = name.replaceAll(".*=", "");
}
if(name.length()>8)
{
name=name.substring(0,8);
}
if (name.length()==8 && name.matches("\\d{8}")) {
SimpleDateFormat fmt = new SimpleDateFormat("yyyyMMdd");
try {
fmt.parse(name);
return name;
} catch (ParseException e) {
}
}
return null;
}
public static String parseThedate(Path base,Path p)
{
Path parent=p.getParent();
while(!parent.toUri().equals(base.toUri())&&base.toUri().compareTo(parent.toUri())<=0)
{
String name=parseThedate(parent.getName());
if(name!=null)
{
return name;
}
parent=parent.getParent();
}
return null;
}
}