Package net.yacy.cora.document

Examples of net.yacy.cora.document.MultiProtocolURI


                urlEnd = contents.indexOf(linebreak,urlStart);
                url = contents.substring(urlStart,urlEnd);
                urlnr = Integer.toString(++urls).toString();
                Properties p = new Properties();
                p.put("name", urlnr);
                anchors.put(new MultiProtocolURI(url), p);
                contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
            }

           // As the result of parsing this function must return a plasmaParserDocument object
            return new Document[]{new Document(
View Full Code Here


            // add all images also to the crawl stack
            hl.putAll(Document.getImagelinks(documents));

            // insert those hyperlinks to the crawler
            MultiProtocolURI nextUrl;
            for (final Map.Entry<MultiProtocolURI, String> nextEntry : hl.entrySet()) {
                // check for interruption
                checkInterruption();

                // process the next hyperlink
                nextUrl = nextEntry.getKey();
                final String u = nextUrl.toNormalform(true, true, false, true);
                if (!(u.startsWith("http://") || u.startsWith("https://") || u.startsWith("ftp://") || u.startsWith("smb://") || u.startsWith("file://"))) continue;
                // enqueue the hyperlink into the pre-notice-url db
                try {
                    this.crawlStacker.enqueueEntry(new Request(
                            response.initiator(),
View Full Code Here

                    Log.logInfo("heuristicRSS", "rss result not parsed from " + feedName);
                    return;
                }

                final Map<MultiProtocolURI, String> links = new TreeMap<MultiProtocolURI, String>();
                MultiProtocolURI uri;
                for (final RSSMessage message: rss.getFeed()) try {
                    uri = new MultiProtocolURI(message.getLink());
                    links.put(uri, message.getTitle());
                } catch (final MalformedURLException e) {
                }

                Log.logInfo("heuristicRSS", "Heuristic: adding " + links.size() + " links from '" + feedName + "' rss feed");
View Full Code Here

        }
       
        final RSSFeed feed = rssReader.getFeed();
        //RSSMessage channel = feed.getChannel();
        final List<Document> docs = new ArrayList<Document>();
        MultiProtocolURI uri;
        Set<String> languages;
        Map<MultiProtocolURI, Properties> anchors;
        Document doc;
        for (final Hit item: feed) try {
            uri = new MultiProtocolURI(item.getLink());
            languages = new HashSet<String>();
            languages.add(item.getLanguage());
            anchors = new HashMap<MultiProtocolURI, Properties>();
            Properties p = new Properties();
            p.put("name", item.getTitle());
View Full Code Here

                final int idx = name.lastIndexOf('.');
                final String mime = TextParser.mimeOf((idx >= 0) ? name.substring(idx + 1) : "");
                try {
                    tmp = FileUtils.createTempFile(this.getClass(), name);
                    FileUtils.copy(zis, tmp, entry.getSize());
                    final MultiProtocolURI virtualURL = MultiProtocolURI.newURL(url, "#" + name);
                    //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
                    docs = TextParser.parseSource(virtualURL, mime, null, tmp);
                    if (docs == null) continue;
                    for (final Document d: docs) docacc.add(d);
                } catch (final Parser.Failure e) {
View Full Code Here

   
   
    public static void main(final String[] args) {
        File image = new File(args[0]);
        genericImageParser parser = new genericImageParser();
        MultiProtocolURI uri;
        try {
            uri = new MultiProtocolURI("http://localhost/" + image.getName());
            Document[] document = parser.parse(uri, "image/" + uri.getFileExtension(), "UTF-8", new FileInputStream(image));
            System.out.println(document[0].toString());
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
View Full Code Here

     */
    public MultiProtocolURI getSitemap() {
        String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
        if (url == null) return null;
        try {
            return new MultiProtocolURI(url);
        } catch (MalformedURLException e) {
            return null;
        }
    }
View Full Code Here

                    System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 1*24*60*60*1000) {
                    return robotsTxt4Host;
                }
               
                // generating the proper url to download the robots txt
                MultiProtocolURI robotsURL = null;
                try {                
                    robotsURL = new MultiProtocolURI("http://" + urlHostPort + "/robots.txt");
                } catch (final MalformedURLException e) {
                    log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
                    robotsURL = null;
                }
               
View Full Code Here

                } else {
               
                    redirectionUrlString = redirectionUrlString.trim();
                   
                    // generating the new URL object
                    final MultiProtocolURI redirectionUrl = MultiProtocolURI.newURL(robotsURL, redirectionUrlString);     
                   
                    // following the redirection
                    if (log.isDebugEnabled()) log.debug("Redirection detected for robots.txt with URL '" + robotsURL + "'." +
                            "\nRedirecting request to: " + redirectionUrl);
                    return downloadRobotsTxt(redirectionUrl,redirectionCount,entry);
View Full Code Here

        } else {
            port = Integer.parseInt(host.substring(pos + 1));
            host = host.substring(0, pos);
        }
       
        final MultiProtocolURI url = new MultiProtocolURI("http", host, port, (args == null) ? path : path + "?" + args);
        return url;
    }
View Full Code Here

TOP

Related Classes of net.yacy.cora.document.MultiProtocolURI

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.