Package: org.webharvest.runtime

Examples of org.webharvest.runtime.Scraper


            ide.setTabIcon(this, ResourceManager.SMALL_RUN_ICON);
        } else if ( this.scraper == null || this.scraper.getStatus() != Scraper.STATUS_RUNNING ) {
            boolean ok = refreshTree();
            if (ok) {
                Settings settings = ide.getSettings();
                this.scraper = new Scraper(this.scraperConfiguration, settings.getWorkingPath());
                this.scraper.addVariablesToContext(initParams);
                if ( settings.isProxyEnabled() ) {
                    HttpClientManager httpClientManager = scraper.getHttpClientManager();

                    int proxyPort = settings.getProxyPort();
View Full Code Here


//        PropertyConfigurator.configure(props);

//        ScraperConfiguration config = new ScraperConfiguration("c:/temp/scrapertest/configs/test2.xml");
        ScraperConfiguration config = new ScraperConfiguration("c:/temp/scrapertest/dddd.xml");
//        ScraperConfiguration config = new ScraperConfiguration( new URL("http://localhost/scripts/test/sample8.xml") );
        Scraper scraper = new Scraper(config, "c:/temp/scrapertest/");

        scraper.setDebug(true);

        long startTime = System.currentTimeMillis();
        scraper.execute();
        System.out.println("time elapsed: " + (System.currentTimeMillis() - startTime));
    }
View Full Code Here

                config = new ScraperConfiguration( new URL(configFilePath) );
            } else {
                config = new ScraperConfiguration(configFilePath);
            }

            Scraper scraper = new Scraper(config, workingDir);

            String isDebug = (String) params.get("debug");
            if ( CommonUtil.isBooleanTrue(isDebug) ) {
                scraper.setDebug(true);
            }

            String proxyHost = (String) params.get("proxyhost");
            if ( proxyHost != null && !"".equals(proxyHost)) {
                String proxyPort = (String) params.get("proxyport");
                if ( proxyPort != null && !"".equals(proxyPort) ) {
                    int port = Integer.parseInt(proxyPort);
                    scraper.getHttpClientManager().setHttpProxy(proxyHost, port);
                } else {
                    scraper.getHttpClientManager().setHttpProxy(proxyHost);
                }
            }

            String proxyUser = (String) params.get("proxyuser");
            if ( proxyUser != null && !"".equals(proxyUser) ) {
                String proxyPassword = (String) params.get("proxypassword");
                String proxyNTHost = (String) params.get("proxynthost");
                String proxyNTDomain = (String) params.get("proxyntdomain");
                scraper.getHttpClientManager().setHttpProxyCredentials(proxyUser, proxyPassword, proxyNTHost, proxyNTDomain);
            }

            // adds initial variables to the scraper's content, if any
            Map caseSensitiveParams = getArgValue(args, true);
            Iterator iterator = caseSensitiveParams.entrySet().iterator();
            while (iterator.hasNext()) {
                Map.Entry entry = (Map.Entry) iterator.next();
                String key = (String) entry.getKey();
                if (key.startsWith("#")) {
                    String varName = key.substring(1);
                    if (varName.length() > 0) {
                        scraper.addVariableToContext(varName, entry.getValue());
                    }
                }
            }

            scraper.execute();
        }
    }
View Full Code Here

    public void executeConfig()
    {
        try
        {
            ScraperConfiguration config = new ScraperConfiguration(configFile);
            Scraper scraper = new Scraper(config,resFile);
            scraper.setDebug(true);
            scraper.execute();
        }
        catch (FileNotFoundException ex)
        {
          
        }
View Full Code Here

    private void scrape() throws CarbonException {
        //todo need to add proxy info
        InputStream in = new ByteArrayInputStream(config.toString().getBytes());
        InputSource inputSource = new InputSource(in);
        ScraperConfiguration scraperConfiguration = new ScraperConfiguration(inputSource);
        Scraper scraper = new Scraper(scraperConfiguration, "");
        // Execute the scraper config
        scraper.execute();
        scraperContext = scraper.getContext();
    }
View Full Code Here

        super(dataService, configId, DBConstants.DataSourceTypes.WEB, properties);
        this.webHarvestConfigPath = this.getProperty("web_harvest_config");
    }

    public Scraper getScraperConfig() throws DataServiceFault {
        Scraper scraper;
        ScraperConfiguration scraperConfiguration;
        try {
            /* For the given file path of the web harvest configuration */
            if (!webHarvestConfigPath.trim().startsWith("<config>")) {
                scraperConfiguration = new ScraperConfiguration(webHarvestConfigPath);
            } else {
                /* If the Web harvest configuration has provided */
                InputStream in = new ByteArrayInputStream(webHarvestConfigPath.getBytes());
                InputSource inputSource = new InputSource(in);
                scraperConfiguration = new ScraperConfiguration(inputSource);
            }
            scraper = new Scraper(scraperConfiguration, "")
            return scraper;
        } catch (FileNotFoundException e) {
            throw new DataServiceFault(e, "Error in reading web harvest configuration");
        }
    }
View Full Code Here

    }

    /*executing the web scraper*/
    public Variable getScrapedResult(String queryVariable) throws DataServiceFault {
        try {
            Scraper scraper = getScraperConfig();
            scraper.execute();
            return (Variable) scraper.getContext().get(queryVariable);
        } catch (Exception e) {
            throw new DataServiceFault(e, "Error in Scraper Execution");
        }

    }
View Full Code Here

    }

    @Override
    public boolean isActive() {
        try {
            Scraper scraper = getScraperConfig();
            return scraper != null;
        } catch (Exception e) {
          log.error("Error in checking Web config availability", e);
            return false;
        }
View Full Code Here

TOP

Related Classes of org.webharvest.runtime.Scraper

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.