Package net.yacy.document

Examples of net.yacy.document.Document


        //RSSMessage channel = feed.getChannel();
        final List<Document> docs = new ArrayList<Document>();
        MultiProtocolURI uri;
        Set<String> languages;
        Map<MultiProtocolURI, Properties> anchors;
        Document doc;
        for (final Hit item: feed) try {
            uri = new MultiProtocolURI(item.getLink());
            languages = new HashSet<String>();
            languages.add(item.getLanguage());
            anchors = new HashMap<MultiProtocolURI, Properties>();
            Properties p = new Properties();
            p.put("name", item.getTitle());
            anchors.put(uri, p);
            doc = new Document(
                    uri,
                    TextParser.mimeOf(url),
                    charset,
                    this,
                    languages,
View Full Code Here


                        break;
                    default:
                        throw new Parser.Failure("Unable to parse SID file, unexpected version: " + version, location);
                }

                return new Document[]{new Document(
                        location,
                        mimeType,
                        "UTF-8",
                        this,
                        null,
View Full Code Here

                }
            }

            final String[] sections = parsedNames.toArray(new String[parsedNames.size()]);
            final byte[] text = UTF8.getBytes(parsedDataText.toString());
            return new Document[]{new Document(
                    url,                        // url of the source document
                    mimeType,                   // the documents mime type
                    null,                       // charset
                    this,
                    null,                       // set of languages
View Full Code Here

                this.log.logWarning("Rejected URL '" + surrogate.getIdentifier(true) + "': " + urlRejectReason);
                continue;
            }

            // create a queue entry
            final Document document = surrogate.document();
            final Request request = new Request(
                    ASCII.getBytes(this.peers.mySeed().hash),
                    surrogate.getIdentifier(true),
                    null,
                    "",
View Full Code Here

        if (queryhashes.isEmpty()) {
            Log.logFine("snippet fetch", "no query hashes given for url " + url);
            return new ArrayList<MediaSnippet>();
        }

        Document document;
        try {
            document = Document.mergeDocuments(url, null, Switchboard.getSwitchboard().loader.loadDocuments(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, timeout, Integer.MAX_VALUE));
        } catch (final IOException e) {
            Log.logFine("snippet fetch", "load error: " + e.getMessage());
            return new ArrayList<MediaSnippet>();
View Full Code Here

                sentences = Document.getSentences(pre, new ByteArrayInputStream(UTF8.getBytes(solrText)));
            }

            // if then no sentences are found, we fail-over to get the content from the re-loaded document
            if (sentences == null) {
                final Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source);
                if (document == null) {
                    return;
                }

                // compute sentences from parsed document
                sentences = document.getSentences(pre);
                document.close();

                if (sentences == null) {
                    init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
                    return;
                }
View Full Code Here

                containsAllHashes(loc = comp.dc_creator(), queryhashes) ||
                containsAllHashes(loc = comp.dc_subject(), queryhashes) ||
                containsAllHashes(loc = comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
                // try to create the snippet from information given in the url
                if (inCache) response = loader == null ? null : loader.load(request, CacheStrategy.CACHEONLY, true);
                Document document = null;
                if (response != null) {
                    try {
                        document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
                    } catch (final Parser.Failure e) {
                    }
                }
                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
                return document;
            } else {
                // try to load the resource from the cache
                response = loader == null ? null : loader.load(request, noCacheUsage ? CacheStrategy.NOCACHE : cacheStrategy, true);
                if (response == null) {
                    // in case that we did not get any result we can still return a success when we are not allowed to go online
                    if (cacheStrategy == null || cacheStrategy.mustBeOffline()) {
                        init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
                        return null;
                    }

                    // if it is still not available, report an error
                    init(url.hash(), null, ResultClass.ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry");
                    return null;
                } else {
                    // place entry on indexing queue
                    Switchboard.getSwitchboard().toIndexer(response);
                    source = ResultClass.SOURCE_WEB;
                }
            }
        } catch (final Exception e) {
            //Log.logException(e);
            init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
            return null;
        }

        /* ===========================================================================
         * PARSE RESOURCE
         * =========================================================================== */
        Document document = null;
        try {
            document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
        } catch (final Parser.Failure e) {
            init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
            return null;
View Full Code Here

        final URIMetadataRow.Components metadata = entry.metadata();
        if (metadata == null || metadata.url() == null) return 0;

        try {
            // parse the resource
            final Document document = Document.mergeDocuments(metadata.url(), null, loader.loadDocuments(loader.request(metadata.url(), true, false), cacheStrategy, 10000, Integer.MAX_VALUE));
            if (document == null) {
                // delete just the url entry
                urlMetadata().remove(urlhash);
                return 0;
            }
View Full Code Here

TOP

Related Classes of net.yacy.document.Document

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.