Package org.archive.modules

Examples of org.archive.modules.CrawlURI


        return e;
    }

    public void testFacebookScrollExample() throws Exception {
        UURI testUuri = UURIFactory.getInstance(TEST_URI);
        CrawlURI testUri = new CrawlURI(testUuri, null, null, LinkContext.NAVLINK_MISC);
        StringBuilder content = new StringBuilder();
        for (String chunk: TEST_CONTENT_CHUNKS) {
            content.append(chunk);
        }
        Recorder recorder = createRecorder(content.toString(), "UTF-8");
        testUri.setContentType("text/html");
        testUri.setRecorder(recorder);
        testUri.setContentSize(content.length());
       
        extractor.process(testUri);
       
        for (String expectedLinkString: EXPECTED_OUTLINKS) {
            CrawlURI expectedLink = testUri.createCrawlURI(
                    UURIFactory.getInstance(expectedLinkString),
                    HTMLLinkContext.INFERRED_MISC, Hop.INFERRED);
            assertTrue(testUri.getOutLinks().contains(expectedLink));
        }
    }
View Full Code Here


   
    // Fragment of a WbmPersistLoadProcessor test: the enclosing method
    // signature is not visible here. It pre-populates a two-entry fetch
    // history, runs the processor, and checks the loaded entry is merged
    // into the correct slot by timestamp.
    final String CONTENT_DIGEST_SCHEME = "sha1:";
    WbmPersistLoadProcessor t = new WbmPersistLoadProcessor();
    t.setHttpClient(client);
    t.setContentDigestScheme(CONTENT_DIGEST_SCHEME);
    CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://archive.org/"));

    // put history entry newer than being loaded (i.e. loaded history entry will not be used for FetchHistoryProcessor
    // check below.
    long expected_ts = DateUtils.parse14DigitDate(TestNormalHttpResponse.EXPECTED_TS).getTime();
    Map<String, Object>[] fetchHistory = (Map[])curi.getData().get(RecrawlAttributeConstants.A_FETCH_HISTORY);
    if (fetchHistory == null) {
      // No history on the fresh CrawlURI yet; install a two-slot array.
      fetchHistory = new HashMap[2];
      curi.getData().put(RecrawlAttributeConstants.A_FETCH_HISTORY, fetchHistory);
    }
    final byte[] digestValue0 = sha1Digest("0");
    final byte[] digestValue1 = sha1Digest("1");
    // Slot 0: entry 2s NEWER than the entry the processor will load.
    fetchHistory[0] = new HashMap<String, Object>();
    fetchHistory[0].put(FetchHistoryHelper.A_TIMESTAMP, expected_ts + 2000);
    fetchHistory[0].put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, expected_ts + 2000);
    fetchHistory[0].put(RecrawlAttributeConstants.A_CONTENT_DIGEST,
        CONTENT_DIGEST_SCHEME + Base32.encode(digestValue0));
    // Slot 1: entry 2s OLDER than the loaded entry.
    fetchHistory[1] = new HashMap<String, Object>();
    fetchHistory[1].put(FetchHistoryHelper.A_TIMESTAMP, expected_ts - 2000);
    fetchHistory[1].put(RecrawlAttributeConstants.A_CONTENT_DIGEST,
        CONTENT_DIGEST_SCHEME + Base32.encode(digestValue1));

    ProcessResult result = t.innerProcessResult(curi);
    assertEquals("result is PROCEED", ProcessResult.PROCEED, result);

    // newly loaded history entry should fall in between two existing entries (index=1)
    Map<String, Object> history = getFetchHistory(curi, 1);
    assertNotNull("history", history);
    String hash = (String)history.get(RecrawlAttributeConstants.A_CONTENT_DIGEST);
    assertEquals("CONTENT_DIGEST", CONTENT_DIGEST_SCHEME+TestNormalHttpResponse.EXPECTED_HASH, hash);

    Long ts = (Long)history.get(FetchHistoryHelper.A_TIMESTAMP);
    assertNotNull("ts is non-null", ts);
    assertEquals("'ts' has expected timestamp", expected_ts, ts.longValue());

    // Check compatibility with FetchHistoryProcessor.
    // TODO: This is not testing WbmPersistLoadProcessor - only testing stub fetchHistory
    // setup above (OK as long as it matches WbmPersistLoadProcessor). We need a separate
    // test method.
    curi.setFetchStatus(200);
    curi.setFetchBeginTime(System.currentTimeMillis());
    // FetchHistoryProcessor once failed for a revisit case. We'd need to test other cases
    // too (TODO).
    curi.setContentDigest("sha1", digestValue0);
    FetchHistoryProcessor fhp = new FetchHistoryProcessor();
    fhp.process(curi);
  }
View Full Code Here

  }
 
  // Integration-style test hitting a real external server (truncated below
  // in this view): loads fetch history for a live URL and asserts a content
  // digest was recovered. NOTE(review): depends on network availability and
  // on the remote host still serving this resource — inherently flaky.
  public void testInnerProcessResultSingleShotWithRealServer() throws Exception {
    WbmPersistLoadProcessor t = new WbmPersistLoadProcessor();
    //CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://archive.org/"));
    CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.mext.go.jp/null.gif"));
    ProcessResult result = t.innerProcessResult(curi);
    // Index 0: the single loaded entry is expected to be the newest.
    Map<String, Object> history = getFetchHistory(curi, 0);
    assertNotNull("getFetchHistory returns non-null", history);
    String hash = (String)history.get(RecrawlAttributeConstants.A_CONTENT_DIGEST);
    assertNotNull("CONTENT_DIGEST is non-null", hash);
View Full Code Here

      this.uri = uri;
    }
    // Worker task (fragment — enclosing class not visible here): builds a
    // CrawlURI from the captured URL string and runs it through processor 'p'.
    @Override
    public void run() {
      try {
      CrawlURI curi = new CrawlURI(UURIFactory.getInstance(this.uri));
      p.innerProcessResult(curi);
      //System.err.println(curi.toString());
      } catch (Exception ex) {
      // Best-effort test driver: log and swallow so other workers continue.
      ex.printStackTrace();
      }
View Full Code Here

    }

   
    // Precondition gate for a CrawlURI (fragment — method is truncated in
    // this view): returns FINISH when a prerequisite (DNS, robots,
    // credentials) must be fetched first, PROCEED when the URI may continue.
    @Override
    protected ProcessResult innerProcessResult(CrawlURI puri) {
        CrawlURI curi = (CrawlURI)puri;
        // DNS lookup needed first? If so, a prerequisite was scheduled.
        if (considerDnsPreconditions(curi)) {
            return ProcessResult.FINISH;
        }

        // make sure we only process schemes we understand (i.e. not dns)
        String scheme = curi.getUURI().getScheme().toLowerCase();
        if (! (scheme.equals("http") || scheme.equals("https"))) {
            logger.fine("PolitenessEnforcer doesn't understand uri's of type " +
                scheme + " (ignoring)");
            return ProcessResult.PROCEED;
        }

        // robots.txt must be fetched/honored before the URI itself.
        if (considerRobotsPreconditions(curi)) {
            return ProcessResult.FINISH;
        }

        // Prerequisites themselves skip the credential check to avoid loops.
        if (!curi.isPrerequisite() && credentialPrecondition(curi)) {
            return ProcessResult.FINISH;
        }

        // OK, it's allowed
View Full Code Here

                    // Frontier URI-selection fragment (enclosing loop/method
                    // truncated in this view): activates a ready queue,
                    // enforces its budgets, then emits its head URI —
                    // requeueing it if its class key has changed.

                    // queue has gone 'in process'
                    readyQ.considerActive();
                    readyQ.setWakeTime(0); // clear obsolete wake time, if any
                   
                    // we know readyQ is not empty (getCount()!=0) so peek() shouldn't return null
                    CrawlURI readyQUri = readyQ.peek(this);
                    // see HER-1973 and HER-1946
                    sheetOverlaysManager.applyOverlaysTo(readyQUri);
                    try {
                        // Budgets are settings-dependent, so evaluate them
                        // under this URI's overlay context.
                        KeyedProperties.loadOverridesFrom(readyQUri);
                        readyQ.setSessionBudget(getBalanceReplenishAmount());
                        readyQ.setTotalBudget(getQueueTotalBudget());
                    } finally {
                        KeyedProperties.clearOverridesFrom(readyQUri);
                    }
                   
                    // Over session budget: park the queue and try another.
                    if (readyQ.isOverSessionBudget()) {
                        deactivateQueue(readyQ);
                        readyQ.makeDirty();
                        readyQ = null;
                        continue;
                    }
                    // Over lifetime budget: retire the queue permanently.
                    if (readyQ.isOverTotalBudget()) {
                        retireQueue(readyQ);
                        readyQ.makeDirty();
                        readyQ = null;
                        continue;
                    }
                } while (readyQ == null);
               
                if (readyQ == null) {
                    // no queues left in ready or readiable
                    break findauri;
                }
          
                returnauri: while(true) { // loop left by explicit return or break on empty
                    CrawlURI curi = null;
                    curi = readyQ.peek(this);  
                    if(curi == null) {
                        // should not reach
                        logger.severe("No CrawlURI from ready non-empty queue "
                                + readyQ.classKey + "\n"
                                + readyQ.shortReportLegend() + "\n"
                                + readyQ.shortReportLine() + "\n");
                        break returnauri;
                    }
                   
                    // from queues, override names persist but not map source
                    curi.setOverlayMapsSource(sheetOverlaysManager);
                    // TODO: consider optimizations avoiding this recalc of
                    // overrides when not necessary
                    sheetOverlaysManager.applyOverlaysTo(curi);
                    // check if curi belongs in different queue
                    String currentQueueKey;
                    try {
                        KeyedProperties.loadOverridesFrom(curi);
                        currentQueueKey = getClassKey(curi);
                    } finally {
                        KeyedProperties.clearOverridesFrom(curi);
                    }
                    if (currentQueueKey.equals(curi.getClassKey())) {
                        // curi was in right queue, emit
                        noteAboutToEmit(curi, readyQ);
                        return curi;
                    }
                    // URI's assigned queue has changed since it
                    // was queued (eg because its IP has become
                    // known). Requeue to new queue.
                    // TODO: consider synchronization on readyQ
                    readyQ.dequeue(this,curi);
                    doJournalRelocated(curi);
                    curi.setClassKey(currentQueueKey);
                    decrementQueuedCount(1);
                    curi.setHolderKey(null);
                    sendToQueue(curi);
                    if(readyQ.getCount()==0) {
                        // readyQ is empty and ready: it's exhausted
                        // release held status, allowing any subsequent
                        // enqueues to again put queue in ready
View Full Code Here

            // Drain all deferred URIs whose scheduled time has arrived
            // (fragment — enclosing method truncated in this view).
            // futureUris appears to be a time-keyed sorted map; headMap()
            // selects every entry due at or before 'now'.
            synchronized(futureUris) {
                Iterator<CrawlURI> iter =
                    futureUris.headMap(System.currentTimeMillis())
                        .values().iterator();
                while(iter.hasNext()) {
                    CrawlURI curi = iter.next();
                    curi.setRescheduleTime(-1); // unless again set elsewhere
                    // Iterator.remove keeps the backing map consistent while
                    // iterating; counter mirrors the map's size.
                    iter.remove();
                    futureUriCount.decrementAndGet();
                    receive(curi);
                }
            }
View Full Code Here

        return extractor;
    }

    private CrawlURI setupURI(String url) throws MalformedURLException, IOException {
        UURI uuri = UURIFactory.getInstance(url);
        CrawlURI curi = new CrawlURI(uuri, null, uuri, LinkContext.NAVLINK_MISC);

        URLConnection conn = new URL(url).openConnection();
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(30000);
        InputStream in = conn.getInputStream();

        Recorder recorder = Recorder.wrapInputStreamWithHttpRecord(
            TestUtils.tmpDir(), this.getClass().getName(), in, null);
        logger.info("got recorder for " + url);

        curi.setContentSize(recorder.getRecordedInput().getSize());
        curi.setContentType("application/x-shockwave-flash");
        curi.setFetchStatus(200);
        curi.setRecorder(recorder);

        return curi;
    }
View Full Code Here

//        testUrls.put("http://wayback.archive-it.org/1094/20080923040243/http://www.dreamingmethods.com/uploads/dm_archive/mainsite/downloads/flash/clearance/loader.swf", "clearance_intro.swf");

        // Test-loop fragment (truncated mid-assert in this view): for each
        // URL->expected-link pair, fetch the SWF, run the extractor, and
        // require the expected outlink among the discovered links.
        for (String url : testUrls.keySet()) {
            logger.info("testing " + url);

            CrawlURI curi;
            try {
                curi = setupURI(url);
            } catch (IOException e) {
                // Network fetch failed — skip rather than fail the suite.
                logger.severe("unable to open url, skipping: " + e);
                continue;
            }

            // Timed purely for log visibility, not asserted on.
            long startTime = System.currentTimeMillis();
            this.extractor.extract(curi);
            long elapsed = System.currentTimeMillis() - startTime;
            logger.info(this.extractor.getClass().getSimpleName() + " took "
                    + elapsed + "ms to process " + url);

            // Suffix match: the expected value is a path tail, not a full URI.
            boolean foundIt = false;
            for (CrawlURI link : curi.getOutLinks()) {
                logger.info("found link: " + link);
                foundIt = foundIt || link.getURI().endsWith(testUrls.get(url));
            }

            assertTrue("failed to extract link \"" + testUrls.get(url)
View Full Code Here

//         testUrls.put("http://wayback.archive-it.org/317/20061129141640/http://www.ine.gov.ve/secciones/modulos/Miranda/sMiranda.swf", "aspectosfisicos.asp?Codigo=Nacimientos&titulo=Nacimientos%20vivos%20registrados%20por%20a%C3%B1o,%20seg%C3%BAn%20municipio%20de%20residencia%20habitual%20de%20la%20madre,%201999-2002&Fuente=Fuente:%20Prefecturas%20y%20Jefaturas%20Civiles&cod_ent=13&nvalor=2_2&seccion=2");
//         testUrls.put("http://wayback.archive-it.org/317/20061129141640/http://www.ine.gov.ve/secciones/modulos/Tachira/sTachira.swf", "aspectosfisicos.asp?Codigo=Nacimientos&titulo=Nacimientos%20vivos%20registrados%20por%20a%C3%B1o,%20seg%C3%BAn%20municipio%20de%20residencia%20habitual%20de%20la%20madre,%201999-2002&Fuente=Fuente:%20Prefecturas%20y%20Jefaturas%20Civiles&cod_ent=13&nvalor=2_2&seccion=2");

        // Second test-loop fragment (truncated mid-if in this view): same
        // fetch/extract/scan pattern as above, but the truncated tail below
        // appears to handle a missing link differently than assertTrue.
        for (String url : testUrls.keySet()) {
            logger.info("testing " + url);
            CrawlURI curi;
            try {
                curi = setupURI(url);
            } catch (IOException e) {
                // Network fetch failed — skip rather than fail the suite.
                logger.severe("unable to open url, skipping: " + e);
                continue;
            }

            // Timed purely for log visibility, not asserted on.
            long startTime = System.currentTimeMillis();
            this.extractor.extract(curi);
            long elapsed = System.currentTimeMillis() - startTime;
            logger.info(this.extractor.getClass().getSimpleName() + " took "
                    + elapsed + "ms to process " + url);

            // Suffix match: the expected value is a path tail, not a full URI.
            boolean foundIt = false;
            for (CrawlURI link : curi.getOutLinks()) {
                logger.info("found link: " + link);
                foundIt = foundIt || link.getURI().endsWith(testUrls.get(url));
            }

            if (!foundIt)
View Full Code Here

TOP

Related Classes of org.archive.modules.CrawlURI

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.