Source Code of cn.edu.hfut.dmic.webcollector.generator.FSDbUpdater

/*
 * Copyright (C) 2014 hu
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */


package cn.edu.hfut.dmic.webcollector.generator;


import cn.edu.hfut.dmic.webcollector.fetcher.FSSegmentWriter;
import cn.edu.hfut.dmic.webcollector.fetcher.SegmentUtils;
import cn.edu.hfut.dmic.webcollector.fetcher.SegmentWriter;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.Link;
import cn.edu.hfut.dmic.webcollector.parser.ParseData;
import cn.edu.hfut.dmic.webcollector.util.Config;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;
import cn.edu.hfut.dmic.webcollector.util.LogUtils;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;






/**
 *
 * @author hu
 */
public class FSDbUpdater implements DbUpdater{


    private SegmentWriter segmentWriter=null;
    private String crawlPath;


    private String segmentName;
   
    /**
     * 构建一个对指定爬取信息文件夹进行更新操作的更新器
     *
     * @param crawlPath
     */
    public FSDbUpdater(String crawlPath) {
        this.crawlPath = crawlPath;
        
    }
    
    protected String getLastSegmentName(){
        String[] segment_list=new File(crawlPath,"segments").list();
        if(segment_list==null){
            return null;
        }
        String segment_path=null;
        long max=0;
        for(String segment:segment_list){
            long timestamp=Long.valueOf(segment);
            if(timestamp>max){
                max=timestamp;
                segment_path=segment;
            }
        }
        return segment_path;
    }


    /**
     * 备份爬取任务列表
     * @throws IOException
     */
    public void backup() throws IOException {
        LogUtils.getLogger().info("backup "+getCrawlPath());
        File oldfile = new File(crawlPath, Config.old_info_path);
        File currentfile = new File(crawlPath, Config.current_info_path);
        FileUtils.copy(currentfile, oldfile);
    }


    /**
     * 判断更新器是否在上锁状态
     *
     * @return 是否上锁
     * @throws IOException
     */
    public boolean isLocked() throws IOException {
        File lockfile = new File(crawlPath + "/" + Config.lock_path);
        if (!lockfile.exists()) {
            return false;
        }
        String lock = new String(FileUtils.readFile(lockfile), "utf-8");
        return lock.equals("1");
    }


    /**
     * 上锁该更新器
     *
     * @throws IOException
     */
    public void lock() throws IOException {
        FileUtils.writeFile(crawlPath + "/" + Config.lock_path, "1".getBytes("utf-8"));
    }


    /**
     * 解锁该更新器
     *
     * @throws IOException
     */
    public void unlock() throws IOException {
        FileUtils.writeFile(crawlPath+ "/" + Config.lock_path, "0".getBytes("utf-8"));
    }
    // DataFileWriter<CrawlDatum> dataFileWriter;


    private void updateAll(ArrayList<CrawlDatum> datums) throws IOException {
        File currentfile = new File(crawlPath, Config.current_info_path);
        if (!currentfile.getParentFile().exists()) {
            currentfile.getParentFile().mkdirs();
        }
        DbWriter<CrawlDatum> writer = new DbWriter<CrawlDatum>(CrawlDatum.class, currentfile);
        for (CrawlDatum crawldatum : datums) {
            writer.write(crawldatum);
        }
        writer.close();
    }


    


    /**
     * 关闭该更新器
     *
     * @throws IOException
     */
    public void close() throws Exception {
        if(segmentWriter!=null){
            segmentWriter.close();
        }
    }


    /**
     * 将爬取记录和爬取任务列表合并，更新爬取任务列表
     *
     * @param segment_path
     * @throws IOException
     */
    @Override
    public void merge() throws IOException {
        if(segmentName==null){
            segmentName=getLastSegmentName(); 
        }
        if(segmentName==null){
            return;
        }
        
        
        try {
            backup();     
        } catch (IOException ex) {
            LogUtils.getLogger().info("Exception",ex);
        }
        
        LogUtils.getLogger().info("merge " + getSegmentPath());
        
        File file_fetch = new File(getSegmentPath(), "fetch/info.avro");
        if (!file_fetch.exists()) {
            return;
        }


        File file_current = new File(crawlPath, Config.current_info_path);
        DbReader<CrawlDatum> reader_current = new DbReader<CrawlDatum>(CrawlDatum.class, file_current);
        DbReader<CrawlDatum> reader_fetch = new DbReader<CrawlDatum>(CrawlDatum.class, file_fetch);


        HashMap<String, Integer> indexmap = new HashMap<String, Integer>();


        ArrayList<CrawlDatum> datums_origin = new ArrayList<CrawlDatum>();
        CrawlDatum datum = null;
        while (reader_current.hasNext()) {
            datum = reader_current.readNext();
            datums_origin.add(datum);
            indexmap.put(datum.getUrl(), datums_origin.size() - 1);
        }


        while (reader_fetch.hasNext()) {
            datum = reader_fetch.readNext();
            if (indexmap.containsKey(datum.getUrl())) {
                if (datum.getStatus() == CrawlDatum.STATUS_DB_UNFETCHED) {
                    continue;
                } else {
                    int preindex = indexmap.get(datum.getUrl());
                    datums_origin.set(preindex, datum);
                    indexmap.put(datum.getUrl(), preindex);
                }


            } else {
                datums_origin.add(datum);
                indexmap.put(datum.getUrl(), datums_origin.size() - 1);
            }


        }
        reader_fetch.close();


        File file_parse = new File(getSegmentPath(), "parse_data/info.avro");
        if (file_parse.exists()) {
            DbReader<ParseData> reader_parse = new DbReader<ParseData>(ParseData.class, file_parse);
            ParseData parseresult = null;
            while (reader_parse.hasNext()) {
                parseresult = reader_parse.readNext();
                for (Link link : parseresult.getLinks()) {
                    datum = new CrawlDatum();
                    datum.setUrl(link.getUrl());
                    datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
                    if (indexmap.containsKey(datum.getUrl())) {
                        continue;
                    } else {
                        datums_origin.add(datum);
                        indexmap.put(datum.getUrl(), datums_origin.size() - 1);
                    }
                }
            }
            reader_parse.close();
        }


        reader_current.close();


        updateAll(datums_origin);


    }


    


    @Override
    public SegmentWriter getSegmentWriter() {
        return segmentWriter;
    }


   


    public String getSegmentPath() {
        return crawlPath+"/segments/"+segmentName;
    }


    
   
    
    
  
    
    


   


    public String getCrawlPath() {
        return crawlPath;
    }


    public void setCrawlPath(String crawlPath) {
        this.crawlPath = crawlPath;
    }


    public String getSegmentName() {
        return segmentName;
    }


    public void setSegmentName(String segmentName) {
        this.segmentName = segmentName;
    }


    @Override
    public void clearHistory() {
        
        File file=new File(crawlPath);
        LogUtils.getLogger().info("clear "+file.getAbsolutePath());
        if(file.exists()){
            FileUtils.deleteDir(file);
        }
        
    }


    @Override
    public void initSegmentWriter() throws Exception {
        segmentName=SegmentUtils.createSegmengName();
        segmentWriter=new FSSegmentWriter(crawlPath, getSegmentPath());
    }




    
    
    
}
Source Code of cn.edu.hfut.dmic.webcollector.generator.FSDbUpdater

Related Classes of cn.edu.hfut.dmic.webcollector.generator.FSDbUpdater