Examples of UURI


Examples of org.archive.net.UURI

     * Test a POST FORM ACTION being found with non-default setting
     *
     * @throws URIException
     */
    public void testFormsLinkFindPost() throws URIException {
        UURI uuri = UURIFactory.getInstance("http://www.example.org");
        CrawlURI curi = new CrawlURI(uuri);
        CharSequence cs =
            "<form name=\"testform\" method=\"POST\" action=\"redirect_me?form=true\"> " +
            "  <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "+
            "  <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> " +
View Full Code Here

Examples of org.archive.net.UURI

            }
        }));
    }
   
    public void testMultipleAttributesPerElement() throws URIException {
        UURI uuri = UURIFactory.getInstance("http://www.example.org");
        CrawlURI curi = new CrawlURI(uuri);
        CharSequence cs = "<a src=\"http://www.example.com/\" href=\"http://www.archive.org/\"> ";
        getExtractor().extract(curi, cs);
        assertTrue("not all links found", curi.getOutLinks().size() == 2);
    }
View Full Code Here

Examples of org.archive.net.UURI

        return Recorder.getHttpRecorder();
    }

    protected CrawlURI makeCrawlURI(String uri) throws URIException,
            IOException {
        UURI uuri = UURIFactory.getInstance(uri);
        CrawlURI curi = new CrawlURI(uuri);
        curi.setSeed(true);
        curi.setRecorder(getRecorder());
        return curi;
    }
View Full Code Here

Examples of org.archive.net.UURI

    }
   
   
    @Override
    protected void innerProcess(CrawlURI curi) {
        UURI uuri = curi.getUURI(); // Current URI.

        // Only http and https schemes are supported.
        String scheme = uuri.getScheme();
        if (!"http".equalsIgnoreCase(scheme)
                && !"https".equalsIgnoreCase(scheme)) {
            return;
        }
        RecordingInputStream recis = curi.getRecorder().getRecordedInput();
        if (0L == recis.getResponseContentLength()) {
            return;
        }

        String baseDir = getPath().getFile().getAbsolutePath();

        // Already have a path for this URI.
        boolean reCrawl = curi.getData().containsKey(A_MIRROR_PATH);

        /*
          The file system path, relative to the value of ATTR_PATH, where
          this resource should be written.  The intent is to
          add later a persistent mapping from URI to path.
          This will allow a URI to be re-crawled and updated
          if it has changed.  If the resource has already been fetched
          and written to a file before, the path to that file
          has already been obtained from the persistent mapping
          and placed on the AList by some other module,
          such as the frontier.
        */
        String mps = null;
        File destFile = null; // Write resource contents to this file.
        try {
            if (reCrawl) {
                mps = (String)curi.getData().get(A_MIRROR_PATH);
                destFile = new File(baseDir + File.separator + mps);
                File parent = destFile.getParentFile();
                if (null != parent) {
                    FileUtils.ensureWriteableDirectory(parent);
                }
            } else {
                URIToFileReturn r = null; // Return from uriToFile().
                try {
                     r = uriToFile(baseDir, curi);
                } catch (AttributeNotFoundException e) {
                    logger.warning(e.getLocalizedMessage());
                    return;
                }
                destFile = r.getFile();
                mps = r.getRelativePath();
            }
            logger.info(uuri.toString() + " -> " + destFile.getPath());
            writeToPath(recis, destFile);
            if (!reCrawl) {
                curi.getData().put(A_MIRROR_PATH, mps);
            }
        } catch (IOException e) {
View Full Code Here

Examples of org.archive.net.UURI

       @throws IOException
       if a non-directory file exists with the same path as a needed directory
    */
    private URIToFileReturn uriToFile(String baseDir, CrawlURI curi)
        throws AttributeNotFoundException, IOException {
        UURI uuri = curi.getUURI(); // Current URI.
        String host = null;
        boolean hd = getCreateHostDirectory();
        if (hd) {
            host = uuri.getHost();
            List<String> hostMap = getHostMap();
            if ((null != hostMap) && (hostMap.size() > 1)) {
                ensurePairs(hostMap);
                Iterator<String> i = hostMap.iterator();
                for (boolean more = true; more && i.hasNext();) {
                    String h1 = i.next();
                    String h2 = i.next();
                    if (host.equalsIgnoreCase(h1)) {
                        more = false;
                        if ((null != h2) && (0 != h2.length())) {
                            host = h2;
                        }
                    }
                }
            }
        }

        int port = getCreatePortDirectory() ? uuri.getPort() : -1;

        String suffix = null; // Replacement suffix.
        List<String> ctm = getContentTypeMap();
        if ((null != ctm) && (ctm.size() > 1)) {
            ensurePairs(ctm);
            String contentType = curi.getContentType().toLowerCase();
            Iterator<String> i = ctm.iterator();
            for (boolean more = true; more && i.hasNext();) {
                String ct = (String) i.next();
                String suf = (String) i.next();
                if ((null != ct) && contentType.startsWith(ct.toLowerCase())) {
                    more = false;
                    if ((null != suf) && (0 != suf.length())) {
                        suffix = suf;
                    }
                }
            }
        }

        int maxSegLen = getMaxSegLength();
        if (maxSegLen < 2) {
            maxSegLen = 2; // MAX_SEG_LEN.getDefaultValue();
        }

        int maxPathLen = getMaxPathLength();
        if (maxPathLen < 2) {
            maxPathLen = 2; // MAX_PATH_LENGTH.getDefaultValue();
        }

        Map<String,String> characterMap = Collections.emptyMap();
        List<String> cm = getCharacterMap();
        if ((null != cm) && (cm.size() > 1)) {
            ensurePairs(cm);
            characterMap = new HashMap<String,String>(cm.size());
            // Above will be half full.
            for (Iterator<String> i = cm.iterator(); i.hasNext();) {
                String s1 = (String) i.next();
                String s2 = (String) i.next();
                if ((null != s1) && (1 == s1.length()) && (null != s2)
                        && (0 != s2.length())) {
                    characterMap.put(s1, s2);
                }
            }
        }

        String dotBegin = getDotBegin();
        if (".".equals(dotBegin)) {
            dotBegin = null;
        }

        String dotEnd = getDotEnd();
        if (".".equals(dotEnd)) {
            dotEnd = null;
        }

        String tld = getTooLongDirectory();
        if ((null == tld) || (0 == tld.length())
                || (-1 != tld.indexOf(File.separatorChar))) {
            tld = "LONG"; // TOO_LONG_DIRECTORY.getDefaultValue();
        }

        Set<String> underscoreSet = null;
        List<String> us = getUnderscoreSet();
        if ((null != us) && (0 != us.size())) {
            underscoreSet = new HashSet<String>(us.size(), 0.5F);
            for (String s: us) {
                if ((null != s) && (0 != s.length())) {
                    underscoreSet.add(s.toLowerCase());
                }
            }
        }

        return uriToFile(curi, host, port, uuri.getPath(), uuri.getQuery(),
            suffix, baseDir, maxSegLen, maxPathLen,
            getCaseSensitiveFilesystem(),
            getDirectoryFile(),
            characterMap, dotBegin, dotEnd, tld,
            getSuffixAtEnd(),
View Full Code Here

Examples of org.archive.net.UURI

        @SuppressWarnings("unchecked")
        protected CrawlURI makeCrawlUri(JSONObject jo) throws URIException,
                JSONException {
            JSONObject joHeaders = jo.getJSONObject("headers");

            UURI uuri = UURIFactory.getInstance(jo.getString("url"));
            UURI via = UURIFactory.getInstance(jo.getString("parentUrl"));

            JSONObject parentUrlMetadata = jo.getJSONObject("parentUrlMetadata");
            String parentHopPath = parentUrlMetadata.getString("pathFromSeed");
            String hopPath = parentHopPath + Hop.INFERRED.getHopString();
View Full Code Here

Examples of org.archive.net.UURI

     * @throws InterruptedException  if the thread is interrupted
     */
    private void fetch(CrawlURI curi, ClientFTP client, Recorder recorder)
    throws IOException, InterruptedException {
        // Connect to the FTP server.
        UURI uuri = curi.getUURI();
        int port = uuri.getPort();
        if (port == -1) {
            port = 21;
        }

        if (socketFactory == null) {
            socketFactory = new SocketFactoryWithTimeout();
        }
        socketFactory.setConnectTimeoutMs(getSoTimeoutMs());
        client.setSocketFactory(socketFactory);
        client.setConnectTimeout(getSoTimeoutMs());
        client.setDefaultTimeout(getSoTimeoutMs());
        client.setDataTimeout(getSoTimeoutMs());
       
        client.connect(uuri.getHost(), port);
       
        client.setSoTimeout(getSoTimeoutMs()); // must be after connect()
       
        // Authenticate.
        String[] auth = getAuth(curi);
        client.login(auth[0], auth[1]);
       
        // The given resource may or may not be a directory.
        // To figure out which is which, execute a CD command to
        // the UURI's path.  If CD works, it's a directory.
        boolean isDirectory = client.changeWorkingDirectory(uuri.getPath());

        // Get a data socket.  This will either be the result of a NLST
        // command for a directory, or a RETR command for a file.
        int command;
        String path;
        if (isDirectory) {
            curi.getAnnotations().add("ftpDirectoryList");
            command = FTPCommand.NLST;
            client.setFileType(FTP.ASCII_FILE_TYPE);
            path = ".";
        } else {
            command = FTPCommand.RETR;
            client.setFileType(FTP.BINARY_FILE_TYPE);
            path = uuri.getPath();
        }

        client.enterLocalPassiveMode();
        Socket socket = null;

View Full Code Here

Examples of org.archive.net.UURI

        String base = curi.toString();
        if (base.endsWith("/")) {
            base = base.substring(0, base.length() - 1);
        }
        try {
            UURI n = UURIFactory.getInstance(base + "/" + file);
            CrawlURI link = curi.createCrawlURI(n, LinkContext.NAVLINK_MISC, Hop.NAVLINK);
            curi.getOutLinks().add(link);
        } catch (URIException e) {
            logger.log(Level.WARNING, "URI error during extraction.", e);           
        }
View Full Code Here

Examples of org.archive.net.UURI

     */
    private void addParent(CrawlURI curi) {
        if (!getExtractParent()) {
            return;
        }
        UURI uuri = curi.getUURI();
        try {
            if (uuri.getPath().equals("/")) {
                // There's no parent to add.
                return;
            }
            String scheme = uuri.getScheme();
            String auth = uuri.getEscapedAuthority();
            String path = uuri.getEscapedCurrentHierPath();
            UURI parent = UURIFactory.getInstance(scheme + "://" + auth + path);

            CrawlURI link = curi.createCrawlURI(parent, LinkContext.NAVLINK_MISC,
                    Hop.NAVLINK);
            curi.getOutLinks().add(link);
        } catch (URIException e) {
View Full Code Here

Examples of org.archive.net.UURI

     * @param curi  the curi whose username and password to return
     * @return  an array containing the username and password
     */
    private String[] getAuth(CrawlURI curi) {
        String[] result = new String[2];
        UURI uuri = curi.getUURI();
        String userinfo;
        try {
            userinfo = uuri.getUserinfo();
        } catch (URIException e) {
            assert false;
            logger.finest("getUserinfo raised URIException.");
            userinfo = null;
        }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.