Package net.yacy.document

Examples of net.yacy.document.Document


   
    public Document[] parse(final MultiProtocolURI location, final String mimeType,
            final String charset, final InputStream source1)
            throws Parser.Failure, InterruptedException {

        final Document[] docs = new Document[]{new Document(
                location,
                mimeType,
                charset,
                this,
                null,
View Full Code Here


            log.logWarning(ex.getMessage());
        } catch (IOException ex) {
            log.logWarning(ex.getMessage());
        }

        return new Document[]{new Document(
            location,
            mimeType,
            "UTF-8",
            this,
            null,
View Full Code Here

        final URIMetadataRow.Components metadata = entry.metadata();
        if (metadata == null || metadata.url() == null) return 0;

        try {
            // parse the resource
            final Document document = Document.mergeDocuments(metadata.url(), null, loader.loadDocuments(loader.request(metadata.url(), true, false), cacheStrategy, 10000, Long.MAX_VALUE));
            if (document == null) {
                // delete just the url entry
                urlMetadata().remove(urlhash);
                return 0;
            }
View Full Code Here

        for (int i = 1; i <= 6; i++) {
            for (final String headline : scraper.getHeadlines(i)) {
                sections[p++] = headline;
            }
        }
        final Document[] ppds = new Document[]{new Document(
                location,
                mimeType,
                charSet,
                scraper,
                scraper.getContentLanguages(),
View Full Code Here

      return null;
    }
  }

  public String autoTag(final String url, final int max, final TreeMap<String, YMarkTag> tags) {
    final Document document = loadDocument(url);
    final TreeSet<YMarkTag> topwords = new TreeSet<YMarkTag>();
    // final TreeMap<String, YMarkTag> pairs = new TreeMap<String, YMarkTag>();

    String token;
    // StringBuilder pair = new StringBuilder(64);

    if(document != null) {
      //get words from document
      final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib).words();

      // generate potential tags from document title, description and subject
      final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
      final StringBuilder buffer = new StringBuilder(bufferSize);
      buffer.append(document.dc_title());
      buffer.append(document.dc_description());
      buffer.append(document.dc_subject(' '));
      final Enumeration<String> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);

      int count = 0;

      // loop through potential tag and rank them
View Full Code Here

            if (title.length() == l) break;
            l = title.length();
        }

        Document[] docs;
        docs = new Document[]{new Document(
                  location,
                  mimeType,
                  "UTF-8",
                  this,
                  null,
View Full Code Here

            throw new Parser.Failure("Load error:" + e.getMessage(), url);
        }
       
        final List<Document> docs = new ArrayList<Document>();
        MultiProtocolURI uri;
        Document doc;
        for (final URLEntry item: sitemap) try {
            uri = new MultiProtocolURI(item.loc);
            doc = new Document(
                    uri,
                    TextParser.mimeOf(url),
                    charset,
                    this,
                    null,
View Full Code Here

            if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
           
            // create the parser document
            Document[] docs = null;
            final byte[] contentBytes = UTF8.getBytes(writer.toString());
            docs = new Document[]{new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    languages,
View Full Code Here

        // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
        // the great number of these objects can easily be seen in Java Visual VM
        // we try to evict these objects from memory here with forced clear calls and hope for the best.
        COSName.clearResources();
        PDFont.clearResources();
        return new Document[]{new Document(
                location,
                mimeType,
                "UTF-8",
                this,
                null,
View Full Code Here

                System.out.println(pdfFile.getAbsolutePath());
                final long startTime = System.currentTimeMillis();

                // parse
                final AbstractParser parser = new pdfParser();
                Document document = null;
                try {
                    document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new FileInputStream(pdfFile)));
                } catch (final Parser.Failure e) {
                    System.err.println("Cannot parse file " + pdfFile.getAbsolutePath());
                    Log.logException(e);
                } catch (final InterruptedException e) {
                    System.err.println("Interrupted while parsing!");
                    Log.logException(e);
                } catch (final NoClassDefFoundError e) {
                    System.err.println("class not found: " + e.getMessage());
                } catch (final FileNotFoundException e) {
                    Log.logException(e);
                }

                // statistics
                System.out.println("\ttime elapsed: " + (System.currentTimeMillis() - startTime) + " ms");

                // output
                if (document == null) {
                    System.out.println("\t!!!Parsing without result!!!");
                } else {
                    System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
                    try {
                        // write file
                        FileUtils.copy(document.getText(), new File("parsedPdf.txt"));
                    } catch (final IOException e) {
                        System.err.println("error saving parsed document");
                        Log.logException(e);
                    }
                }
View Full Code Here

TOP

Related Classes of net.yacy.document.Document

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.