// Package net.sf.jftp.tools
//
// Source code of net.sf.jftp.tools.FileSearch and its helper class Getter

/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.

* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/
package net.sf.jftp.tools;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.Socket;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.util.Vector;

import net.sf.jftp.system.LocalIO;
import net.sf.jftp.system.logging.Log;


public class FileSearch
{

    private int currentDepth = 0;
    private Hashtable checked = new Hashtable();
    public static boolean quiet = true;
    public static boolean ultraquiet = false;
   
    String localDir = ".";
    int MAX = 999999;
    int MIN_TERM = 1;
    int MIN_FACTOR = 1;
    boolean LOAD = false;
    String[] typeArray = { "" };
    String[] termArray = { "" };
    String[] optArray = { "" };
    String[] ignoreArray = { "" };
    String[] scanArray = { "" };
   

    public static void main(String argv[]) {
        String[] typeArray = { ".gz", ".bz2", ".zip", ".rar" };
        String[] termArray = { "linux", "kernel" };
        String[] optArray = { "download", "file", "mirror", "location" };
        String[] ignoreArray = { ".gif", ".jpg", ".png", ".swf", ".jar", ".class", ".google." };
        String[] scanArray = { ".html", ".htm", "/", ".jsp", ".jhtml", ".phtml", ".asp", ".xml", ".js", ".cgi" };
        String url = "http://www.google.de/search?hl=de&q=";
       
        for(int i=0; i<termArray.length; i++) {
          url += termArray[i]+"+";
        }
       
      FileSearch search = new FileSearch();
       
      search.typeArray = typeArray;
      search.termArray = termArray;
      search.optArray = optArray;
      search.ignoreArray = ignoreArray;
      search.scanArray = scanArray;
      search.MIN_TERM = 1;
     
      search.spider(url);
     
    }

    private void spider(String url)
    {
        try
        {
          if(url.indexOf("/") < 0)
          {
            url = url + "/";
          }
         
        url = clear(url);
       
          Log.out(">>> URL: "+url);
          Log.out(">>> Scanning for ");
         
          for(int i = 0; i < typeArray.length; i++)
          {
            Log.out(typeArray[i] + " ");
          }
         
          Log.out("");
         

            Log.out("Fetching initial HTML file...");

            Getter urlGetter = new Getter(localDir);
            urlGetter.fetch(url, true);

            Log.out("Searching for links...");
            LocalIO.pause(500);

            crawl(url);
        }
        catch(Exception ex)
        {
            ex.printStackTrace();
        }
    }

    private String clear(String url)
    {
        int idx = url.indexOf("http://");

        if(idx >= 0)
        {
            url = url.substring(7);
        }

        return url;
    }

    private Vector addVector(Vector v, Vector x)
    {
        Enumeration e = x.elements();

        while(e.hasMoreElements())
        {
            String next = (String) e.nextElement();
            v.add(next);
        }

        return v;
    }

    private int rate(String content) {
      int score = 0;
     
      for(int i=0; i<termArray.length; i++) {
        if(content.indexOf(termArray[i]) >= 0) score += 3;
      }     
     
      if(score < MIN_TERM) return 0;
  
      for(int i=0; i<optArray.length; i++) {
        if(content.indexOf(optArray[i]) >= 0) score++;
      }
     
      return score;
    }
   
    private int checkForResult(String url) {
      //for(int i=0; i<typeArray.length; i++) {
      //  if(url.indexOf(typeArray[i]) >= 0) return 2;
      //}     
     
      for(int i=0; i<ignoreArray.length; i++) {
        if(url.indexOf(ignoreArray[i]) >= 0) return -1
      }
     
      if(!checkForScanableUrl(url)) return -1;
     
      return 1;
    }
   
    private boolean checkForScanableUrl(String url) {
     
      if(checked.containsKey(url)) {
        return false;
      }
      else {
        checked.put(url, "");
      }
     
      if(url.indexOf("/") > 0) {
        String tmp = url.substring(0, url.indexOf("/"));
      }
     
      for(int i=0; i<scanArray.length; i++) {
        if(url.endsWith(scanArray[i])) return true;
      }     

      return false;
    }
   
    private void crawl(String url) throws Exception
    {
        url = clear(url);

        int urlRating = checkForResult(url);
        if(!quiet) Log.out("URL-Rating: "+url+" -> "+urlRating+" @"+currentDepth);
       
        if(urlRating > 0) {
          //System.out.println("!!!");
          //Getter.chill(1000);
          //System.exit(0);
        } else if(urlRating < 0 && currentDepth > 0) {
          if(!quiet) Log.out("SKIP "+url);    
          return;
        }


        Getter urlGetter = new Getter(localDir);
        String content = urlGetter.fetch(url);
       
        int factor = rate(content);
        if(!quiet) Log.out("Content-Rating: "+url+" -> "+factor+" @"+currentDepth);
       
        if(factor < MIN_FACTOR) {
          if(!quiet) Log.out("DROP: "+url);
          return;
        }
       
        if(!ultraquiet) Log.out("Url: "+url+" -> "+urlRating+":"+factor+"@"+currentDepth);

        Vector m = sort(content, url.substring(0, url.lastIndexOf("/")),
                              "href=\"");
        m = addVector(m,
                      sort(content, url.substring(0, url.lastIndexOf("/")),
                                 "src=\""));
        m = addVector(m,
                      sort(content, url.substring(0, url.lastIndexOf("/")),
                                 "HREF=\""));
        m = addVector(m,
                      sort(content, url.substring(0, url.lastIndexOf("/")),
                                 "SRC=\""));

        Enumeration links = m.elements();

        while(links.hasMoreElements())
        {

            String next = (String) links.nextElement();
           
            if(!quietLog.out("PROCESS: " + next);
            boolean skip = false;
           
            while(!skip) {
              for(int i = 0; i < typeArray.length; i++)
              {
                if(next.endsWith(typeArray[i]) ||
                    typeArray[i].trim().equals("*"))
                {
                  Log.out("HIT: "+url+" -> "+next);
                  //Getter.chill(2000);
                 
                  if(!LOAD || !checkForScanableUrl(url)) continue;
                 
                  int x = next.indexOf("/");
                 
                  if((x > 0) && (next.substring(0, x).indexOf(".") > 0))
                  {
                    Getter urlGetter2 = new Getter(localDir);
                    urlGetter2.fetch(next, false);
                   
                    continue;
                  }
                }
              }
             
              skip = true;
            }

            if(currentDepth < MAX)
            {

                int x = next.indexOf("/");

                if((x > 0) && (next.substring(0, x).indexOf(".") > 0))
                {
                    currentDepth++;
                    crawl(next);
                    currentDepth--;
                }
            }
        }
    }

    private Vector sort(String content, String url, String index)
    {
        Vector res = new Vector();
        int wo = 0;

        while(true)
        {
            wo = content.indexOf(index);

            if(wo < 0)
            {
                return res;
            }

            content = content.substring(wo + index.length());

            String was = content.substring(0, content.indexOf("\""));

            was = createAbsoluteUrl(was, url);
            res.add(was);
            if(!quiet) Log.out("ADD: " + was);
        }
    }

    private String[] check(String auswahl)
    {
        StringTokenizer tokenizer = new StringTokenizer(auswahl, "-", false);
        String[] strArr = new String[tokenizer.countTokens()];
        int tmp = 0;

        while(tokenizer.hasMoreElements())
        {
            strArr[tmp] = (String) tokenizer.nextElement();
            tmp++;
        }

        return strArr;
    }

    private String createAbsoluteUrl(String newLink, String baseUrl)
    {
        newLink = clear(newLink);

        if(newLink.startsWith(baseUrl))
        {
            return newLink;
        }

        if(newLink.startsWith("/") && (baseUrl.indexOf("/") > 0))
        {
            newLink = baseUrl.substring(0, baseUrl.indexOf("/")) + newLink;
        }
        else if(newLink.startsWith("/") && (baseUrl.indexOf("/") < 0))
        {
            newLink = baseUrl + newLink;
        }
        else if((newLink.indexOf(".") > 0))
        {
            int idx = newLink.indexOf("/");
            String tmp = "";

            if(idx >= 0)
            {
                tmp = newLink.substring(0, idx);
            }

            if((tmp.indexOf(".") > 0))
            {
                return clear(newLink);
            }

            if(baseUrl.endsWith("/"))
            {
                newLink = baseUrl + newLink;
            }
            else
            {
                newLink = baseUrl + "/" + newLink;
            }
        }

        //Log.out("-> " + newLink);

        return newLink;
    }

}


class Getter
{
    private String localDir = null;

    public Getter(String localDir)
    {
        this.localDir = localDir;
    }

    public String fetch(String url)
    {
        try
        {
            String host = url.substring(0, url.indexOf("/"));
            String wo = url.substring(url.indexOf("/"));
            String result = "";

            //Log.out(">> " + host + wo);

            Socket deal = new Socket(host, 80);
            deal.setSoTimeout(5000);

            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(deal.getOutputStream()));
            BufferedReader in = new BufferedReader(new InputStreamReader(deal.getInputStream()));

            out.write("GET http://" + url + " HTTP/1.0\n\n");
            out.flush();

            int len = 0;

            while(!in.ready() && (len < 5000))
            {
                chill(100);
                len += 100;
            }

            while(in.ready())
            {
                result = result + in.readLine();
            }

            out.close();
            in.close();

            return result;
        }
        catch(Exception ex)
        {
          if(!FileSearch.quiet) ex.printStackTrace();
        }

        return "";
    }

    public void fetch(String url, boolean force)
    {
        try
        {
            String host = url.substring(0, url.indexOf("/"));
            String wo = url.substring(url.indexOf("/"));
            String result = "";

            if(!FileSearch.quiet) Log.debug(">>> " + host + wo);

            //JFtp.statusP.jftp.ensureLogging();
            File d = new File(localDir);
            d.mkdir();

            File f = new File(localDir + wo.substring(wo.lastIndexOf("/") + 1));

            if(f.exists() && !force)
            {
              if(!FileSearch.quiet) Log.debug(">>> file already exists...");

                return;
            }
            else
            {
                f.delete();
            }

            Socket deal = new Socket(host, 80);
            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(deal.getOutputStream()));
            DataInputStream in = new DataInputStream(new BufferedInputStream(deal.getInputStream()));

            BufferedOutputStream localOut = new BufferedOutputStream(new FileOutputStream(localDir +
                                                                                        wo.substring(wo.lastIndexOf("/") +
                                                                                                     1)));

            byte[] alu = new byte[2048];

            out.write("GET http://" + url + " HTTP/1.0\n\n");
            out.flush();

            boolean line = true;
            boolean bin = false;

            while(true)
            {
                chill(10);

                String tmp = "";

                while(line)
                {
                    String x = in.readLine();

                    if(x == null)
                    {
                        break;
                    }

                    tmp += (x + "\n");

                    if(x.equals(""))
                    {
                        line = false;
                    }
                }

                int x = in.read(alu);

                if(x == -1)
                {
                    if(line)
                    {
                        localOut.write(tmp.getBytes(), 0, tmp.length());
                    }

                    out.close();
                    in.close();
                    localOut.flush();
                    localOut.close();

                    return;
                }
                else
                {
                    localOut.write(alu, 0, x);
                }
            }
        }
        catch(Exception ex)
        {
          if(!FileSearch.quiet) ex.printStackTrace();
        }
    }

    public static void chill(int time)
    {
        try
        {
            Thread.sleep(time);
        }
        catch(Exception ex)
        {
        }
    }
}
// TOP
//
// Related Classes of net.sf.jftp.tools.Getter
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.