Package cn.edu.hfut.dmic.webcollector.generator

Source Code of cn.edu.hfut.dmic.webcollector.generator.FSDbUpdater

/*
* Copyright (C) 2014 hu
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/

package cn.edu.hfut.dmic.webcollector.generator;

import cn.edu.hfut.dmic.webcollector.fetcher.FSSegmentWriter;
import cn.edu.hfut.dmic.webcollector.fetcher.SegmentUtils;
import cn.edu.hfut.dmic.webcollector.fetcher.SegmentWriter;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.Link;
import cn.edu.hfut.dmic.webcollector.parser.ParseData;
import cn.edu.hfut.dmic.webcollector.util.Config;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;
import cn.edu.hfut.dmic.webcollector.util.LogUtils;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;



/**
*
* @author hu
*/
public class FSDbUpdater implements DbUpdater{

    private SegmentWriter segmentWriter=null;
    private String crawlPath;

    private String segmentName;
  
    /**
     * 构建一个对指定爬取信息文件夹进行更新操作的更新器
     *
     * @param crawlPath
     */
    public FSDbUpdater(String crawlPath) {
        this.crawlPath = crawlPath;
       
    }
   
    protected String getLastSegmentName(){
        String[] segment_list=new File(crawlPath,"segments").list();
        if(segment_list==null){
            return null;
        }
        String segment_path=null;
        long max=0;
        for(String segment:segment_list){
            long timestamp=Long.valueOf(segment);
            if(timestamp>max){
                max=timestamp;
                segment_path=segment;
            }
        }
        return segment_path;
    }

    /**
     * 备份爬取任务列表
     * @throws IOException
     */
    public void backup() throws IOException {
        LogUtils.getLogger().info("backup "+getCrawlPath());
        File oldfile = new File(crawlPath, Config.old_info_path);
        File currentfile = new File(crawlPath, Config.current_info_path);
        FileUtils.copy(currentfile, oldfile);
    }

    /**
     * 判断更新器是否在上锁状态
     *
     * @return 是否上锁
     * @throws IOException
     */
    public boolean isLocked() throws IOException {
        File lockfile = new File(crawlPath + "/" + Config.lock_path);
        if (!lockfile.exists()) {
            return false;
        }
        String lock = new String(FileUtils.readFile(lockfile), "utf-8");
        return lock.equals("1");
    }

    /**
     * 上锁该更新器
     *
     * @throws IOException
     */
    public void lock() throws IOException {
        FileUtils.writeFile(crawlPath + "/" + Config.lock_path, "1".getBytes("utf-8"));
    }

    /**
     * 解锁该更新器
     *
     * @throws IOException
     */
    public void unlock() throws IOException {
        FileUtils.writeFile(crawlPath+ "/" + Config.lock_path, "0".getBytes("utf-8"));
    }
    // DataFileWriter<CrawlDatum> dataFileWriter;

    private void updateAll(ArrayList<CrawlDatum> datums) throws IOException {
        File currentfile = new File(crawlPath, Config.current_info_path);
        if (!currentfile.getParentFile().exists()) {
            currentfile.getParentFile().mkdirs();
        }
        DbWriter<CrawlDatum> writer = new DbWriter<CrawlDatum>(CrawlDatum.class, currentfile);
        for (CrawlDatum crawldatum : datums) {
            writer.write(crawldatum);
        }
        writer.close();
    }

   

    /**
     * 关闭该更新器
     *
     * @throws IOException
     */
    public void close() throws Exception {
        if(segmentWriter!=null){
            segmentWriter.close();
        }
    }

    /**
     * 将爬取记录和爬取任务列表合并,更新爬取任务列表
     *
     * @param segment_path
     * @throws IOException
     */
    @Override
    public void merge() throws IOException {
        if(segmentName==null){
            segmentName=getLastSegmentName();
        }
        if(segmentName==null){
            return;
        }
       
       
        try {
            backup();    
        } catch (IOException ex) {
            LogUtils.getLogger().info("Exception",ex);
        }
       
        LogUtils.getLogger().info("merge " + getSegmentPath());
       
        File file_fetch = new File(getSegmentPath(), "fetch/info.avro");
        if (!file_fetch.exists()) {
            return;
        }

        File file_current = new File(crawlPath, Config.current_info_path);
        DbReader<CrawlDatum> reader_current = new DbReader<CrawlDatum>(CrawlDatum.class, file_current);
        DbReader<CrawlDatum> reader_fetch = new DbReader<CrawlDatum>(CrawlDatum.class, file_fetch);

        HashMap<String, Integer> indexmap = new HashMap<String, Integer>();

        ArrayList<CrawlDatum> datums_origin = new ArrayList<CrawlDatum>();
        CrawlDatum datum = null;
        while (reader_current.hasNext()) {
            datum = reader_current.readNext();
            datums_origin.add(datum);
            indexmap.put(datum.getUrl(), datums_origin.size() - 1);
        }

        while (reader_fetch.hasNext()) {
            datum = reader_fetch.readNext();
            if (indexmap.containsKey(datum.getUrl())) {
                if (datum.getStatus() == CrawlDatum.STATUS_DB_UNFETCHED) {
                    continue;
                } else {
                    int preindex = indexmap.get(datum.getUrl());
                    datums_origin.set(preindex, datum);
                    indexmap.put(datum.getUrl(), preindex);
                }

            } else {
                datums_origin.add(datum);
                indexmap.put(datum.getUrl(), datums_origin.size() - 1);
            }

        }
        reader_fetch.close();

        File file_parse = new File(getSegmentPath(), "parse_data/info.avro");
        if (file_parse.exists()) {
            DbReader<ParseData> reader_parse = new DbReader<ParseData>(ParseData.class, file_parse);
            ParseData parseresult = null;
            while (reader_parse.hasNext()) {
                parseresult = reader_parse.readNext();
                for (Link link : parseresult.getLinks()) {
                    datum = new CrawlDatum();
                    datum.setUrl(link.getUrl());
                    datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
                    if (indexmap.containsKey(datum.getUrl())) {
                        continue;
                    } else {
                        datums_origin.add(datum);
                        indexmap.put(datum.getUrl(), datums_origin.size() - 1);
                    }
                }
            }
            reader_parse.close();
        }

        reader_current.close();

        updateAll(datums_origin);

    }

   

    @Override
    public SegmentWriter getSegmentWriter() {
        return segmentWriter;
    }

  

    public String getSegmentPath() {
        return crawlPath+"/segments/"+segmentName;
    }

   
  
   
   
 
   
   

  

    public String getCrawlPath() {
        return crawlPath;
    }

    public void setCrawlPath(String crawlPath) {
        this.crawlPath = crawlPath;
    }

    public String getSegmentName() {
        return segmentName;
    }

    public void setSegmentName(String segmentName) {
        this.segmentName = segmentName;
    }

    @Override
    public void clearHistory() {
       
        File file=new File(crawlPath);
        LogUtils.getLogger().info("clear "+file.getAbsolutePath());
        if(file.exists()){
            FileUtils.deleteDir(file);
        }
       
    }

    @Override
    public void initSegmentWriter() throws Exception {
        segmentName=SegmentUtils.createSegmengName();
        segmentWriter=new FSSegmentWriter(crawlPath, getSegmentPath());
    }


   
   
   
}
TOP

Related Classes of cn.edu.hfut.dmic.webcollector.generator.FSDbUpdater

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.