Package net.yacy.document

Examples of net.yacy.document.Document


        final StringBuilder sb = new StringBuilder();
        for (final String[] row: table) {
            sb.append(concatRow(row)).append(' ');
        }
        try {
            return new Document[]{new Document(
                    location,
                    mimeType,
                    charset,
                    this,
                    null,
View Full Code Here


           
            /*
             * create the plasmaParserDocument for the database
             * and set shortText and bodyText properly
             */
            final Document[] docs = new Document[]{new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    null,
View Full Code Here

            if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
           
            // create the parser document
            Document[] docs = null;
            final byte[] contentBytes = UTF8.getBytes(writer.toString());
            docs = new Document[]{new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    languages,
View Full Code Here

            final RTFEditorKit theRtfEditorKit = new RTFEditorKit();              
            theRtfEditorKit.read(source, doc, 0);           
           
            final String bodyText = doc.getText(0, doc.getLength());
           
            return new Document[]{new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    null,
View Full Code Here

     * all extracted information about the parsed document
     */
    public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source)
            throws Parser.Failure, InterruptedException {

      Document theDoc = null;
     
        try {
            String contents = "";
            SummaryInformation summary = null;
            try {
                final VisioTextExtractor extractor = new VisioTextExtractor(source);
              contents = extractor.getText();
                summary = extractor.getSummaryInformation();
            } catch (Exception e) {
              Log.logWarning("vsdParser", e.getMessage());
            }

            String author = null;
            String[] keywords = null;
            String title = null;
            if (summary != null) {
                author = summary.getAuthor();
                if (summary.getKeywords() != null) {
                    keywords = summary.getKeywords().split("[ ,;]");
                }
                title = summary.getTitle();
            }

            String abstrct = null;
            abstrct = ((contents.length() > 80)? contents.substring(0, 80) : contents.trim()).
                          replaceAll("\r\n"," ").
                          replaceAll("\n"," ").
                          replaceAll("\r"," ").
                          replaceAll("\t"," ");
           
            if (title == null) {
                title = abstrct;
            }

           // As the result of parsing this function must return a plasmaParserDocument object
            return new Document[]{new Document(
                    location,     // url of the source document
                    mimeType,     // the documents mime type
                    "UTF-8",      // charset of the document text
                    this,
                    null,         // language
View Full Code Here

   
    return phrases;
  }

  public static String autoTag(final String url, final LoaderDispatcher loader, final int max, final TreeMap<String, YMarkTag> tags) {
    final Document document = loadDocument(url, loader);
    if (document != null)
      return autoTag(document, max, tags);
    else
      return "/IOExceptions";
  }
View Full Code Here

                anchors.put(new MultiProtocolURI(url), p);
                contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
            }

           // As the result of parsing this function must return a plasmaParserDocument object
            return new Document[]{new Document(
                    location,     // url of the source document
                    mimeType,     // the documents mime type
                    "UTF-8",      // charset of the document text
                    this,
                    null,
View Full Code Here

            if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
           
            // create the parser document
            Document[] docs = null;
            final byte[] contentBytes = UTF8.getBytes(writer.toString());
            docs = new Document[]{new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    languages,
View Full Code Here

        // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
        // the great number of these objects can easily be seen in Java Visual VM
        // we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
        COSName.clearResources();
        PDFont.clearResources();
        return new Document[]{new Document(
                location,
                mimeType,
                "UTF-8",
                this,
                null,
View Full Code Here

                System.out.println(pdfFile.getAbsolutePath());
                final long startTime = System.currentTimeMillis();

                // parse
                final AbstractParser parser = new pdfParser();
                Document document = null;
                try {
                    document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new FileInputStream(pdfFile)));
                } catch (final Parser.Failure e) {
                    System.err.println("Cannot parse file " + pdfFile.getAbsolutePath());
                    Log.logException(e);
                } catch (final InterruptedException e) {
                    System.err.println("Interrupted while parsing!");
                    Log.logException(e);
                } catch (final NoClassDefFoundError e) {
                    System.err.println("class not found: " + e.getMessage());
                } catch (final FileNotFoundException e) {
                    Log.logException(e);
                }

                // statistics
                System.out.println("\ttime elapsed: " + (System.currentTimeMillis() - startTime) + " ms");

                // output
                if (document == null) {
                    System.out.println("\t!!!Parsing without result!!!");
                } else {
                    System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
                    try {
                        // write file
                        FileUtils.copy(document.getText(), new File("parsedPdf.txt"));
                    } catch (final IOException e) {
                        System.err.println("error saving parsed document");
                        Log.logException(e);
                    }
                }
View Full Code Here

TOP

Related Classes of net.yacy.document.Document

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.