Package net.yacy.document

Examples of net.yacy.document.Document


                prop.put("viewMode_html", 1);
                prop.put("viewMode_html_url", url.toNormalform(false, true));
            }
        } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("words") || viewMode.equals("links")) {
            // parsing the resource content
            Document document = null;
            try {
                document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
                if (document == null) {
                    prop.put("error", "5");
                    prop.put("error_errorText", "Unknown error");
                    prop.put("viewMode", VIEW_MODE_NO_TEXT);
                    return prop;
                }
            } catch (final Parser.Failure e) {
                prop.put("error", "5");
                prop.putHTML("error_errorText", e.getMessage());
                prop.put("viewMode", VIEW_MODE_NO_TEXT);
                return prop;
            }

            if (viewMode.equals("parsed")) {
                final String content = UTF8.String(document.getTextBytes());
                // content = wikiCode.replaceHTML(content); // added by Marc Nause
                prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT);
                prop.put("viewMode_title", document.dc_title());
                prop.put("viewMode_creator", document.dc_creator());
                prop.put("viewMode_subject", document.dc_subject(','));
                prop.put("viewMode_description", document.dc_description());
                prop.put("viewMode_publisher", document.dc_publisher());
                prop.put("viewMode_format", document.dc_format());
                prop.put("viewMode_identifier", document.dc_identifier());
                prop.put("viewMode_source", url.toString());
                prop.put("viewMode_lat", document.lat());
                prop.put("viewMode_lon", document.lon());
                prop.put("viewMode_parsedText", markup(wordArray, content).replaceAll("\n", "<br />").replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"));

            } else if (viewMode.equals("sentences")) {
                prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
                final Collection<StringBuilder> sentences = document.getSentences(pre);

                boolean dark = true;
                int i = 0;
                String sentence;
                if (sentences != null) {

                    // Search word highlighting
                    for (final StringBuilder s: sentences) {
                        sentence = s.toString();
                        if (sentence.trim().length() > 0) {
                            prop.put("viewMode_sentences_" + i + "_nr", i + 1);
                            prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, sentence));
                            prop.put("viewMode_sentences_" + i + "_dark", dark ? "1" : "0");
                            dark = !dark;
                            i++;
                        }
                    }
                }
                prop.put("viewMode_sentences", i);

            } else if (viewMode.equals("words")) {
                prop.put("viewMode", VIEW_MODE_AS_PARSED_WORDS);
                final Collection<StringBuilder> sentences = document.getSentences(pre);

                boolean dark = true;
                int i = 0;
                String sentence, token;
                if (sentences != null) {

                    // Search word highlighting
                    for (final StringBuilder s: sentences) {
                        sentence = s.toString();
                        Enumeration<String> tokens = null;
                        tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib);
                        while (tokens.hasMoreElements()) {
                            token = tokens.nextElement();
                            if (token.length() > 0) {
                                prop.put("viewMode_words_" + i + "_nr", i + 1);
                                prop.put("viewMode_words_" + i + "_word", token);
                                prop.put("viewMode_words_" + i + "_dark", dark ? "1" : "0");
                                dark = !dark;
                                i++;
                            }
                        }
                    }
                }
                prop.put("viewMode_words", i);

            } else if (viewMode.equals("links")) {
                prop.put("viewMode", VIEW_MODE_AS_LINKLIST);
                boolean dark = true;
                int i = 0;
                i += putMediaInfo(prop, wordArray, i, document.getVideolinks(), "video", (i % 2 == 0));
                i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
                dark = (i % 2 == 0);

                final Map<MultiProtocolURI, ImageEntry> ts = document.getImages();
                final Iterator<ImageEntry> tsi = ts.values().iterator();
                ImageEntry entry;
                while (tsi.hasNext()) {
                    entry = tsi.next();
                    prop.put("viewMode_links_" + i + "_nr", i);
                    prop.put("viewMode_links_" + i + "_dark", dark ? "1" : "0");
                    prop.put("viewMode_links_" + i + "_type", "image");
                    prop.put("viewMode_links_" + i + "_text", (entry.alt().isEmpty()) ? "&nbsp;" : markup(wordArray, entry.alt()));
                    prop.put("viewMode_links_" + i + "_url", entry.url().toNormalform(false, true));
                    prop.put("viewMode_links_" + i + "_link", markup(wordArray, entry.url().toNormalform(false, true)));
                    if (entry.width() > 0 && entry.height() > 0)
                        prop.put("viewMode_links_" + i + "_attr", entry.width() + "x" + entry.height() + " Pixel");
                    else
                        prop.put("viewMode_links_" + i + "_attr", "unknown");
                    dark = !dark;
                    i++;
                }
                i += putMediaInfo(prop, wordArray, i, document.getApplinks(), "app", (i % 2 == 0));
                i += putMediaInfo(prop, wordArray, i, document.getHyperlinks(), "link", (i % 2 == 0));
                prop.put("viewMode_links", i);

            }
            if (document != null) document.close();
        }
        prop.put("error", "0");
        prop.put("error_url", url.toNormalform(false, true));
        prop.put("error_hash", urlHash);
        prop.put("error_wordCount", wordCount);
View Full Code Here


        try {
            documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1));
        } catch (Exception e) {
            throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
        }
        Document document = Document.mergeDocuments(url, null, documents);
        final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
        return super.storeDocument(
                url,
                null,
                new Date(url.lastModified()),
View Full Code Here

        final StringBuilder sb = new StringBuilder();
        for (final String[] row: table) {
            sb.append(concatRow(row)).append(' ');
        }
        try {
            return new Document[]{new Document(
                    location,
                    mimeType,
                    charset,
                    this,
                    null,
View Full Code Here

           
            /*
             * create the plasmaParserDocument for the database
             * and set shortText and bodyText properly
             */
            final Document[] docs = new Document[]{new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    null,
View Full Code Here

        if (queryhashes.isEmpty()) {
            Log.logFine("snippet fetch", "no query hashes given for url " + url);
            return new ArrayList<MediaSnippet>();
        }

        Document document;
        try {
            document = Document.mergeDocuments(url, null, Switchboard.getSwitchboard().loader.loadDocuments(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, timeout, Long.MAX_VALUE));
        } catch (final IOException e) {
            Log.logFine("snippet fetch", "load error: " + e.getMessage());
            return new ArrayList<MediaSnippet>();
View Full Code Here

   
    public Document document() {
        HashSet<String> languages = new HashSet<String>();
        languages.add(getLanguage());
       
        return new Document(
            getIdentifier(true),
            "text/html",
            "UTF-8",
            this,
            languages,
View Full Code Here

            null,
            false);
    }
   
    public void writeXML(OutputStreamWriter os) throws IOException {
        Document doc = document();
        if (doc != null) {
            doc.writeXML(os, this.getDate());
        }
    }
View Full Code Here

            } else {
                parseUsingJava(sourceFile,outputFile);
            }
           
            // return result
            final Document[] docs = new Document[]{new Document(
                    location, // url
                    mimeType, // mime
                    "UTF-8", // charset
                    this,
                    null,     // languages
View Full Code Here

               
                /*
                 * create the plasmaParserDocument for the database
                 * and set shortText and bodyText properly
                 */
                return new Document[]{new Document(
                        location,
                        mimeType,
                        "UTF-8",
                        this,
                        null,
View Full Code Here

        SUPPORTED_EXTENSIONS.add("7z");
        SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
    }
   
    public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException {
        final Document doc = new Document(
                location,
                mimeType,
                charset,
                this,
                null,
View Full Code Here

TOP

Related Classes of net.yacy.document.Document

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.