Package net.yacy.document

Examples of net.yacy.document.Document


        }

        /* ===========================================================================
         * PARSE RESOURCE
         * =========================================================================== */
        Document document = null;
        try {
            document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
        } catch (final Parser.Failure e) {
            init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
            return;
        }
        if (document == null) {
            init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
            return;
        }

        /* ===========================================================================
         * COMPUTE SNIPPET
         * =========================================================================== */
        // we have found a parseable non-empty file: use the lines

        // compute snippet from text
        final Collection<StringBuilder> sentences = document.getSentences(pre);
        if (sentences == null) {
            init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
            return;
        }
        final SnippetExtractor tsr;
        String textline = null;
        HandleSet remainingHashes = queryhashes;
        try {
            tsr = new SnippetExtractor(sentences, queryhashes, snippetMaxLength);
            textline = tsr.getSnippet();
            remainingHashes =  tsr.getRemainingWords();
        } catch (final UnsupportedOperationException e) {
            init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
            return;
        }

        // compute snippet from media
        //String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
        //String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
        //String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
        //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
        //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);

        snippetLine = "";
        //if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
        //if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
        //if (appline   != null) line += (line.length() == 0) ? appline   : "<br />" + appline;
        //if (hrefline  != null) line += (line.length() == 0) ? hrefline  : "<br />" + hrefline;
        if (textline  != null) snippetLine += (snippetLine.length() == 0) ? textline  : "<br />" + textline;

        if (snippetLine == null || !remainingHashes.isEmpty()) {
            init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
            return;
        }
        if (snippetLine.length() > snippetMaxLength) snippetLine = snippetLine.substring(0, snippetMaxLength);

        // finally store this snippet in our own cache
        snippetsCache.put(wordhashes, urls, snippetLine);

        document.close();
        init(url.hash(), snippetLine, source, null);
    }
View Full Code Here


            final BObject nameo = info.get("name");
            if (nameo != null) title = UTF8.String(nameo.getString());
        }
        if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName());
        try {
            return new Document[]{new Document(
                    location,
                    mimeType,
                    charset,
                    this,
                    null,
View Full Code Here

                }
            }

            final String[] sections = parsedNames.toArray(new String[parsedNames.size()]);
            final byte[] text = UTF8.getBytes(parsedDataText.toString());
            return new Document[]{new Document(
                    url,                        // url of the source document
                    mimeType,                   // the document's mime type
                    null,                       // charset
                    this,
                    null,                       // set of languages
View Full Code Here

            if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
           
            // create the parser document
            Document[] docs = null;
            final byte[] contentBytes = UTF8.getBytes(writer.toString());
            docs = new Document[]{new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    languages,
View Full Code Here

            final RTFEditorKit theRtfEditorKit = new RTFEditorKit();              
            theRtfEditorKit.read(source, doc, 0);           
           
            final String bodyText = doc.getText(0, doc.getLength());
           
            return new Document[]{new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    null,
View Full Code Here

     * all extracted information about the parsed document
     */
    public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source)
            throws Parser.Failure, InterruptedException {

      Document theDoc = null;
     
        try {
            String contents = "";
            SummaryInformation summary = null;
            try {
                final VisioTextExtractor extractor = new VisioTextExtractor(source);
              contents = extractor.getText();
                summary = extractor.getSummaryInformation();
            } catch (Exception e) {
              Log.logWarning("vsdParser", e.getMessage());
            }

            String author = null;
            String[] keywords = null;
            String title = null;
            if (summary != null) {
                author = summary.getAuthor();
                if (summary.getKeywords() != null) {
                    keywords = summary.getKeywords().split("[ ,;]");
                }
                title = summary.getTitle();
            }

            String abstrct = null;
            abstrct = ((contents.length() > 80)? contents.substring(0, 80) : contents.trim()).
                          replaceAll("\r\n"," ").
                          replaceAll("\n"," ").
                          replaceAll("\r"," ").
                          replaceAll("\t"," ");
           
            if (title == null) {
                title = abstrct;
            }

           // As the result of parsing, this function must return an array of Document objects
            return new Document[]{new Document(
                    location,     // url of the source document
                    mimeType,     // the document's mime type
                    "UTF-8",      // charset of the document text
                    this,
                    null,         // language
View Full Code Here

                anchors.put(new MultiProtocolURI(url), p);
                contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
            }

           // As the result of parsing, this function must return an array of Document objects
            return new Document[]{new Document(
                    location,     // url of the source document
                    mimeType,     // the document's mime type
                    "UTF-8",      // charset of the document text
                    this,
                    null,
View Full Code Here

        //RSSMessage channel = feed.getChannel();
        final List<Document> docs = new ArrayList<Document>();
        MultiProtocolURI uri;
        Set<String> languages;
        Map<MultiProtocolURI, Properties> anchors;
        Document doc;
        for (final Hit item: feed) try {
            uri = new MultiProtocolURI(item.getLink());
            languages = new HashSet<String>();
            languages.add(item.getLanguage());
            anchors = new HashMap<MultiProtocolURI, Properties>();
            Properties p = new Properties();
            p.put("name", item.getTitle());
            anchors.put(uri, p);
            doc = new Document(
                    uri,
                    TextParser.mimeOf(url),
                    charset,
                    this,
                    languages,
View Full Code Here

                        break;
                    default:
                        throw new Parser.Failure("Unable to parse SID file, unexpected version: " + version, location);
                }

                return new Document[]{new Document(
                        location,
                        mimeType,
                        "UTF-8",
                        this,
                        null,
View Full Code Here

                this.log.logWarning("Rejected URL '" + surrogate.getIdentifier(true) + "': " + urlRejectReason);
                continue;
            }

            // create a queue entry
            final Document document = surrogate.document();
            final Request request = new Request(
                    ASCII.getBytes(this.peers.mySeed().hash),
                    surrogate.getIdentifier(true),
                    null,
                    "",
View Full Code Here

TOP

Related Classes of net.yacy.document.Document

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.