Examples of UURI


Examples of org.archive.net.UURI

        return add2(uri, max, dest, context, hop);
    }

    public static void add(CrawlURI uri, int max, String newUri,
            LinkContext context, Hop hop) throws URIException {
        UURI dest = UURIFactory.getInstance(newUri);
        add2(uri, max, dest, context, hop);
    }
View Full Code Here

Examples of org.archive.net.UURI

        }
    }

    protected void addHeaderLink(CrawlURI curi, String headerName, String url) {
        try {
            UURI dest = UURIFactory.getInstance(curi.getUURI(), url);
            LinkContext lc = HTMLLinkContext.get(headerName+":");
            addOutlink(curi, dest.toString(), lc, Hop.REFER);
            numberOfLinksExtracted.incrementAndGet();
        } catch (URIException e) {
            logUriError(e, curi.getUURI(), url);
        }
    }
View Full Code Here

Examples of org.archive.net.UURI

        return e;
    }

    public void testFacebookScrollExample() throws Exception {
        UURI testUuri = UURIFactory.getInstance(TEST_URI);
        CrawlURI testUri = new CrawlURI(testUuri, null, null, LinkContext.NAVLINK_MISC);
        StringBuilder content = new StringBuilder();
        for (String chunk: TEST_CONTENT_CHUNKS) {
            content.append(chunk);
        }
View Full Code Here

Examples of org.archive.net.UURI

        // initExtractor(extractor);
        return extractor;
    }

    private CrawlURI setupURI(String url) throws MalformedURLException, IOException {
        UURI uuri = UURIFactory.getInstance(url);
        CrawlURI curi = new CrawlURI(uuri, null, uuri, LinkContext.NAVLINK_MISC);

        URLConnection conn = new URL(url).openConnection();
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(30000);
View Full Code Here

Examples of org.archive.net.UURI

                // other HREFs treated as links
                processLink(curi, attrValue, context);
            }
            if ("base".equals(elementName)) {
                try {
                    UURI base = UURIFactory.getInstance(attrValue);
                    curi.setBaseURI(base);
                } catch (URIException e) {
                    logUriError(e, curi.getUURI(), attrValue);
                }
            }
        }
        // ACTION
        if (((attr = attributes.get("action")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            if (!ignoreFormActions) {
                CharSequence context = elementContext(elementName, attr
                        .getKey());
                processLink(curi, attrValue, context);
            }
        }
        // ON_
        if ((attrList = findOnAttributes(attributes)).size() != 0) {
            for (Iterator<Attribute> attrIter = attrList.iterator(); attrIter.hasNext();) {
                attr = (Attribute) attrIter.next();
                CharSequence valueSegment = attr.getValueSegment();
                if (valueSegment != null)
                    processScriptCode(curi, valueSegment);

            }
        }
        // SRC etc.
        if ((((attr = attributes.get("src")) != null)
                || ((attr = attributes.get("lowsrc")) != null)
                || ((attr = attributes.get("background")) != null)
                || ((attr = attributes.get("cite")) != null)
                || ((attr = attributes.get("longdesc")) != null)
                || ((attr = attributes.get("usemap")) != null)
                || ((attr = attributes.get("profile")) != null)
                || ((attr = attributes.get("datasrc")) != null)) &&
                   ((attrValue = attr.getValue()) != null)) {

            final Hop hopType;
            CharSequence context = elementContext(elementName, attr.getKey());

            if (!framesAsEmbeds
                    && ("frame".equals(elementName) || "iframe"
                            .equals(elementName)))
                hopType = Hop.NAVLINK;
            else
                hopType = Hop.EMBED;

            processEmbed(curi, attrValue, context, hopType);
        }
        // CODEBASE
        if (((attr = attributes.get("codebase")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            codebase = StringEscapeUtils.unescapeHtml(attrValue);
            CharSequence context = elementContext(elementName, attr.getKey());
            processEmbed(curi, codebase, context);
        }
        // CLASSID DATA
        if ((((attr = attributes.get("classid")) != null)
                || ((attr = attributes.get("data")) != null)) &&
                   ((attrValue = attr.getValue()) != null)) {
            if (resources == null)
                resources = new ArrayList<String>();
            resources.add(attrValue);
        }
        // ARCHIVE
        if (((attr = attributes.get("archive")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            if (resources == null)
                resources = new ArrayList<String>();
            String[] multi = TextUtils.split(WHITESPACE, attrValue);
            for (int i = 0; i < multi.length; i++) {
                resources.add(multi[i]);
            }
        }
        // CODE
        if (((attr = attributes.get("code")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            if (resources == null)
                resources = new ArrayList<String>();
            // If element is applet and code value does not end with
            // '.class' then append '.class' to the code value.
            if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {
                resources.add(attrValue + CLASSEXT);
            } else {
                resources.add(attrValue);
            }
        }
        // VALUE
        if (((attr = attributes.get("value")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            CharSequence valueContext = elementContext(elementName, attr.getKey());
            if("PARAM".equalsIgnoreCase(elementName)
                    && "flashvars".equalsIgnoreCase(attributes.get("name").getValue())) {
                // special handling for <PARAM NAME='flashvars" VALUE="">
                String queryStringLike = attrValue.toString();
                // treat value as query-string-like "key=value[;key=value]*" pairings
                considerQueryStringValues(curi, queryStringLike, valueContext,Hop.SPECULATIVE);
            } else {
                // regular VALUE handling
                if (overlyEagerLinkDetection) {
                    considerIfLikelyUri(curi,attrValue,valueContext,Hop.NAVLINK);
                }
            }
        }
        // STYLE
        if (((attr = attributes.get("style")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            // STYLE inline attribute
            // then, parse for URIs
            numberOfLinksExtracted.addAndGet(ExtractorCSS.processStyleCode(
                    this, curi, attrValue));
        }
       
        // FLASHVARS
        if (((attr = attributes.get("flashvars")) != null) &&
                ((attrValue = attr.getValue()) != null)) {
            // FLASHVARS inline attribute
            CharSequence valueContext = elementContext(elementName, attr.getKey());
            considerQueryStringValues(curi, attrValue, valueContext,Hop.SPECULATIVE);
       }

        // handle codebase/resources
        if (resources == null)
            return;

        Iterator<String> iter = resources.iterator();
        UURI codebaseURI = null;
        String res = null;
        try {
            if (codebase != null) {
                // TODO: Pass in the charset.
                codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);
            }
            while (iter.hasNext()) {
                res = iter.next();
                res = StringEscapeUtils.unescapeHtml(res);
                if (codebaseURI != null) {
                    res = codebaseURI.resolve(res).toString();
                }
                processEmbed(curi, res, element); // TODO: include attribute
                                                    // too
            }
        } catch (URIException e) {
View Full Code Here

Examples of org.archive.net.UURI

     *         should be terminated for some other reason.  False if
     *         we can proceed to process this url.
     */
    protected boolean considerRobotsPreconditions(CrawlURI curi) {
        // treat /robots.txt fetches specially
        UURI uuri = curi.getUURI();
        try {
            if (uuri != null && uuri.getPath() != null &&
                    curi.getUURI().getPath().equals("/robots.txt")) {
                // allow processing to continue
                curi.setPrerequisite(true);
                return false;
            }
View Full Code Here

Examples of org.archive.net.UURI

                    // other HREFs treated as links
                    processLink(curi, value, context);
                }
                if (elementStr.equalsIgnoreCase(BASE)) {
                    try {
                        UURI base = UURIFactory.getInstance(value.toString());
                        curi.setBaseURI(base);
                    } catch (URIException e) {
                        logUriError(e, curi.getUURI(), value);
                    }
                }
            } else if (attr.start(3) > -1) {
                // ACTION
                if (!ignoreFormActions) {
                    action = value;
                    actionContext = elementContext(element, attr.group(3));
                    // handling finished only at end (after METHOD also collected)
                }
            } else if (attr.start(4) > -1) {
                // ON____
                processScriptCode(curi, value); // TODO: context?
            } else if (attr.start(5) > -1) {
                // SRC etc.
                CharSequence context = elementContext(element, attr.group(5));
               
                // true, if we expect another HTML page instead of an image etc.
                final Hop hop;
               
                if(!framesAsEmbeds
                    && (elementStr.equalsIgnoreCase(FRAME) || elementStr
                        .equalsIgnoreCase(IFRAME))) {
                    hop = Hop.NAVLINK;
                } else {
                    hop = Hop.EMBED;
                }
                processEmbed(curi, value, context, hop);
            } else if (attr.start(6) > -1) {
                // CODEBASE
                codebase = (value instanceof String)?
                    (String)value: value.toString();
                CharSequence context = elementContext(element,
                    attr.group(6));
                processLink(curi, codebase, context);
            } else if (attr.start(7) > -1) {
                // CLASSID, DATA
                if (resources == null) {
                    resources = new ArrayList<String>();
                }
                resources.add(value.toString());
            } else if (attr.start(8) > -1) {
                // ARCHIVE
                if (resources==null) {
                    resources = new ArrayList<String>();
                }
                String[] multi = TextUtils.split(WHITESPACE, value);
                for(int i = 0; i < multi.length; i++ ) {
                    resources.add(multi[i]);
                }
            } else if (attr.start(9) > -1) {
                // CODE
                if (resources==null) {
                    resources = new ArrayList<String>();
                }
                // If element is applet and code value does not end with
                // '.class' then append '.class' to the code value.
                if (elementStr.equalsIgnoreCase(APPLET) &&
                        !value.toString().toLowerCase().endsWith(CLASSEXT)) {
                    resources.add(value.toString() + CLASSEXT);
                } else {
                    resources.add(value.toString());
                }
            } else if (attr.start(10) > -1) {
                // VALUE, with possibility of URI
                // store value, context for handling at end
                valueVal = value;
                valueContext = elementContext(element,attr.group(10));
            } else if (attr.start(11) > -1) {
                // STYLE inline attribute
                // then, parse for URIs
                numberOfLinksExtracted.addAndGet(ExtractorCSS.processStyleCode(
                        this, curi, value));       
            } else if (attr.start(12) > -1) {
                // METHOD
                method = value;
                // form processing finished at end (after ACTION also collected)
            } else if (attr.start(13) > -1) {
                if("NAME".equalsIgnoreCase(attrName.toString())) {
                    // remember 'name' for end-analysis
                    nameVal = value;
                }
                if("FLASHVARS".equalsIgnoreCase(attrName.toString())) {
                    // consider FLASHVARS attribute immediately
                    valueContext = elementContext(element,attr.group(13));
                    considerQueryStringValues(curi, value, valueContext,Hop.SPECULATIVE);
                }
                // any other attribute
                // ignore for now
                // could probe for path- or script-looking strings, but
                // those should be vanishingly rare in other attributes,
                // and/or symptomatic of page bugs
            }
        }
        TextUtils.recycleMatcher(attr);

        // handle codebase/resources
        if (resources != null) {
            Iterator<String> iter = resources.iterator();
            UURI codebaseURI = null;
            String res = null;
            try {
                if (codebase != null) {
                    // TODO: Pass in the charset.
                    codebaseURI = UURIFactory.
                        getInstance(curi.getUURI(), codebase);
                }
                while(iter.hasNext()) {
                    res = iter.next().toString();
                    res = (String) TextUtils.unescapeHtml(res);
                    if (codebaseURI != null) {
                        res = codebaseURI.resolve(res).toString();
                    }
                    processEmbed(curi, res, element); // TODO: include attribute too
                }
            } catch (URIException e) {
                curi.getNonFatalFailures().add(e);
View Full Code Here

Examples of org.archive.net.UURI

        ArrayList<UURI> list = new ArrayList<UURI>(1000);
        int count = 0;
        final int MAX_COUNT = 1000;
        for (; count < MAX_COUNT; count++) {
            assertEquals("count off",count,filter.count());
            UURI u = UURIFactory.getInstance("http://www" +
                    count + ".archive.org/" + count + "/index.html");
            assertFalse("already contained "+u.toString(),filter.bloom.contains(u.toString()));
            logger.fine("adding "+u.toString());
            filter.add(u.toString(), new CrawlURI(u));
            assertTrue("not in bloom",filter.bloom.contains(u.toString()));
            if (count > 0 && ((count % 100) == 0)) {
                list.add(u);
            }
        }
        logger.fine("Added " + count + " in " +
                (System.currentTimeMillis() - start));

        start = System.currentTimeMillis();
        for (Iterator<UURI> i = list.iterator(); i.hasNext();) {
            UURI uuri = i.next();
            filter.add(uuri.toString(), new CrawlURI(uuri));
        }
        logger.fine("Readded subset " + list.size() + " in " +
                (System.currentTimeMillis() - start));

        assertTrue("Count is off: " + filter.count(),
View Full Code Here

Examples of org.archive.net.UURI

    throws DatabaseException, URIException {
        long start = System.currentTimeMillis();
        ArrayList<UURI> list = new ArrayList<UURI>(1000);
        int count = 0;
        for (; count < max; count++) {
            UURI u = UURIFactory.getInstance("http://www" +
                count + ".archive.org/" + count + "/index.html");
            this.filter.add(u.toString(), new CrawlURI(u));
            if (count > 0 && ((count % 100) == 0)) {
                list.add(u);
            }
            if (count > 0 && ((count % 100000) == 0)) {
                this.logger.info("Added " + count + " in " +
                    (System.currentTimeMillis() - start) +
                    " misses " +
                    ((BdbUriUniqFilter)this.filter).getCacheMisses() +
                    " diff of misses " +
                    ((BdbUriUniqFilter)this.filter).getLastCacheMissDiff());
            }
        }
        this.logger.info("Added " + count + " in " +
            (System.currentTimeMillis() - start));
       
        start = System.currentTimeMillis();
        for (Iterator<UURI> i = list.iterator(); i.hasNext();) {
            UURI uuri = i.next();
            this.filter.add(uuri.toString(), new CrawlURI(uuri));
        }
        this.logger.info("Added random " + list.size() + " in " +
                (System.currentTimeMillis() - start));
       
        start = System.currentTimeMillis();
        for (Iterator<UURI> i = list.iterator(); i.hasNext();) {
            UURI uuri = i.next();
            this.filter.add(uuri.toString(), new CrawlURI(uuri));
        }
        this.logger.info("Deleted random " + list.size() + " in " +
            (System.currentTimeMillis() - start));
        // Looks like delete doesn't work.
        assertTrue("Count is off: " + this.filter.count(),
View Full Code Here

Examples of org.archive.net.UURI

                                                     // minus '.'
            // Does not begin with scheme, so try http://
            uri = "http://" + uri;
        }
        try {
            UURI uuri = UURIFactory.getInstance(uri);
            CrawlURI curi = new CrawlURI(uuri);
            curi.setSeed(true);
            curi.setSchedulingDirective(SchedulingConstants.MEDIUM);
            if (getSourceTagSeeds()) {
                curi.setSourceTag(curi.toString());
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.