Package org.parosproxy.paros.core.spider

Source Code of org.parosproxy.paros.core.spider.SpiderThread

/*
*
* Paros and its related class files.
*
* Paros is an HTTP/HTTPS proxy for assessing web application security.
* Copyright (C) 2003-2004 Chinotec Technologies Company
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the Clarified Artistic License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* Clarified Artistic License for more details.
*
* You should have received a copy of the Clarified Artistic License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/

package org.parosproxy.paros.core.spider;

import java.io.IOException;
import java.util.List;

import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.parosproxy.paros.network.HttpHeader;
import org.parosproxy.paros.network.HttpMalformedHeaderException;
import org.parosproxy.paros.network.HttpMessage;
import org.parosproxy.paros.network.HttpResponseHeader;
import org.parosproxy.paros.network.HttpStatusCode;


/**
*
* To change the template for this generated type comment go to
* Window - Preferences - Java - Code Generation - Code and Comments
*/
public class SpiderThread extends Thread {

    private static final String[] NEGLECT_SUFFIXES = {"gif", "jpg", "bmp", "mp3", "arj", "doc", "swf", "pdf", "mpg", "wmv", "zip", "exe", "cab", "iso", "avi"};
   
    private Spider parent = null;
    private boolean stop = false;
    private List queue = null;
    private boolean completed = false;
    private Collector collector = null;
    private boolean emptyQueue = false;
   
    /**
     * @return Returns the emptyQueue.
     */
    boolean isEmptyQueue() {
        return emptyQueue;
    }
   
    SpiderThread(Spider parent) {
        this.parent = parent;
        queue = parent.getQueue();
        collector = new Collector(this);
        this.setDaemon(true);
        this.setPriority(Thread.NORM_PRIORITY-2);
    }
   
   
    /**
     * @return Returns the stop.
     */
    boolean isStop() {
        return stop;
    }
    /**
     * @param stop The stop to set.
     */
    void setStop(boolean stop) {
        this.stop = stop;
    }
   
    public void run() {

        QueueItem item = null;
//        while (!isStop() && !queue.isEmpty()) {
       
        // to avoid 1 thread running but other thread exit.
        while (!isStop() && !parent.isAllThreadEmptyQueue()) {

            try {
                synchronized(queue) {
               
                    // get distinct item from queue
                    do {
                        item = null;
                        if (queue.isEmpty()) {
                            try {
                                setEmptyQueue(true);
                                Thread.sleep(500);
                            } catch (InterruptedException ie) {}

                        } else {
                            item = (QueueItem) queue.remove(0);
                            setEmptyQueue(false);
                        }                       
                    } while (!stop && item != null && parent.isInVisitedLink(item.getMessage()));
                }
               
                if (item != null) {
                    parent.SpiderProgress(item);
                    crawl(item.getMessage(), item.getDepth());
                    item.getHistoryReference().delete();
                    try {
                        Thread.sleep(30);
                    } catch (InterruptedException e1) {}
                } else if (!stop) {
                    // no item, waiting for all spider queue empty
                    try {
                        Thread.sleep(500);
                    } catch (InterruptedException e1) {}
                   
                }

            } catch (Exception e) {
                e.printStackTrace();
            }
        }
       
        if (queue.isEmpty()) {
            completed = true;
        }
       
        parent.checkIfAllThreadCompleted();
       
    }

    private void readMsgResponse(HttpMessage msg) throws HttpException, IOException, HttpMalformedHeaderException {
       
        msg.getRequestHeader().setHeader(HttpHeader.IF_MODIFIED_SINCE, null);
        msg.getRequestHeader().setHeader(HttpHeader.IF_NONE_MATCH, null);
        msg.getRequestHeader().setContentLength(msg.getRequestBody().length());
        parent.getHttpSender().sendAndReceive(msg);
        msg.getResponseHeader().setHeader(HttpHeader.TRANSFER_ENCODING, null);           
    }

    private void crawl(HttpMessage msg, int depth) {
       
        Html html = null;

       
        try {
           
            if (isNeglectCrawl(msg)) {
                parent.readURI(msg);
                return;
            }
           
            readMsgResponse(msg);
//            if (msg.getResponseHeader().isEmpty() || msg.getResponseHeader().getStatusCode() == HttpStatusCode.NOT_MODIFIED) {
//                if (!readMsgResponse(msg)) {
//                  return;
//              }
//            }
           
           
            if (!HttpStatusCode.isSuccess(msg.getResponseHeader().getStatusCode())) {
                return;
            }

            if (msg.getResponseHeader().getContentLength() > 200000) {
                msg.setResponseHeader(new HttpResponseHeader());
                msg.getResponseBody().setBody("");
            }

            parent.readURI(msg);

            if (isNeglectResponse(msg.getResponseHeader())) {
                return;
            }
           
            html = new Html(msg.getRequestHeader().getURI(), msg.getResponseBody().toString());
            collector.collect(html, depth);

            // no more response processing needed.  remove from msg to save memory
           
        } catch (Exception e) {
           
            e.printStackTrace();
        } finally {
            msg.setResponseHeader(new HttpResponseHeader());
            msg.getResponseBody().setBody("");
            parent.addVisitedLink(msg);
        }
       
    }
   




    /**
     * Build URI given a base HTML.  Keep absolute if it is.
     * @param html
     * @param link
     * @return
     * @throws URIException
     */
    private URI buildURI(URI base, String link) throws URIException {

        URI uri = null;
        /*
        try {
            uri = new URI(link, true);
            if (uri.isAbsoluteURI()) {
                return uri;
            }
        } catch (URIException e) {}
        */
       
        uri = new URI(base, link, true);
        return uri;
    }
   
   
    void foundURI(HttpMessage msg, String referer, int currentDepth) throws URIException {
        msg.getRequestHeader().setHeader(HttpHeader.REFERER, referer);
        parent.foundURI(msg, currentDepth+1);       
    }

    private boolean isNeglectCrawl(HttpMessage msg) {
        boolean result = false;

       
        URI uri = msg.getRequestHeader().getURI();

        try {
           
            // check if need to skip this URL from config
            if (parent.getSpiderParam().isSkipURL(uri)) {
                return true;
            }
           
            // check if suffix relevant
            if (uri.getPath() != null) {
                String path = uri.getPath().toLowerCase();
                for (int i=0; i<NEGLECT_SUFFIXES.length; i++) {
                    String suffix = "." + NEGLECT_SUFFIXES[i];
                    if (path.endsWith(suffix)) {
                        return true;
                    }
                }
            }
           
        } catch (Exception e) {}
       
        return result;
       
    }

    private boolean isNeglectResponse(HttpResponseHeader resHeader) {
       
        if (!HttpStatusCode.isSuccess(resHeader.getStatusCode())) {
            return true;
        }
       
        if (resHeader.isImage()) {
            return true;
        }

        if (resHeader.isText()) {
            return false;
        }

        // do not process - not html file
        if (resHeader.getContentLength() > 200000) {
            return true;
        }       
       
        return false;
    }
   
    /**
     * @return Returns the completed.
     */
    public boolean isCompleted() {
        return completed;
    }
    /**
     * @param emptyQueue The emptyQueue to set.
     */
    private void setEmptyQueue(boolean emptyQueue) {
        this.emptyQueue = emptyQueue;
    }
   
    Spider getParent() {
        return parent;
    }
}
TOP

Related Classes of org.parosproxy.paros.core.spider.SpiderThread

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.