Package: net.yacy.document

Usage examples of net.yacy.document.Condenser


            // NOTE(review): excerpt — the enclosing try block and method signature lie outside this view.
            documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1));
        } catch (Exception e) {
            // Any parse failure is rewrapped as IOException; the original cause is dropped
            // and only its message is kept — callers lose the stack trace of the root cause.
            throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
        }
        // Fold all documents produced by the parser into one merged Document for indexing.
        Document document = Document.mergeDocuments(url, null, documents);
        // Condense the merged document into word/phrase statistics.
        // The two boolean flags presumably enable text and media indexing — see the
        // profile().indexText()/indexMedia() call sites elsewhere; confirm against Condenser's signature.
        final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
        return super.storeDocument(
                url,
                null,
                new Date(url.lastModified()),
                new Date(),
View Full Code Here


                // NOTE(review): excerpt — the enclosing method and its preceding branch begin outside this view.
                urlMetadata().remove(urlhash);
                return 0;
            }
            // get the word set
            // NOTE(review): the null initialization is redundant (reassigned on the next line),
            // and keySet() never returns null, so the null-check below is effectively dead.
            Set<String> words = null;
            words = new Condenser(document, true, true, null).words().keySet();

            // delete all word references
            int count = 0;
            if (words != null) count = termIndex().remove(Word.words2hashesHandles(words), urlhash);
View Full Code Here

    // NOTE(review): excerpt — the enclosing method begins outside this view.
    String token;
    // StringBuilder pair = new StringBuilder(64);

    if(document != null) {
      //get words from document
      final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib).words();

      // generate potential tags from document title, description and subject
      // Buffer is presized to the combined metadata length plus slack for separators.
      final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
      final StringBuilder buffer = new StringBuilder(bufferSize);
      buffer.append(document.dc_title());
View Full Code Here

    return metadata;
  }

  public TreeMap<String,Word> getWordCounts() {
    if (this.document != null) {
            return sortWordCounts(new Condenser(this.document, true, true, LibraryProvider.dymLib).words());
        }
    return new TreeMap<String, Word>();
  }
View Full Code Here

    // Ad-hoc CLI test: parse a local .torrent file (args[0]) and print its condensed word statistics.
    // NOTE(review): excerpt — the method is truncated below; closing braces lie outside this view.
    public static void main(String[] args) {
        try {
            byte[] b = FileUtils.read(new File(args[0]));
            torrentParser parser = new torrentParser();
            // Placeholder URL — presumably only used for parser bookkeeping, not fetched; confirm.
            Document[] d = parser.parse(new MultiProtocolURI("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b));
            Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib);
            Map<String, Word> w = c.words();
            // Dump each extracted word together with its first position in the text.
            for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Parser.Failure e) {
            e.printStackTrace();
View Full Code Here

        // NOTE(review): excerpt — the enclosing method begins and ends outside this view.
        // Build one Condenser per parsed document of this queue entry.
        final Condenser[] condenser = new Condenser[in.documents.length];
        if (this.log.isFine()) {
            this.log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'");
        }
        for (int i = 0; i < in.documents.length; i++) {
            // Text/media indexing flags are taken from the crawl profile of the queue entry.
            condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib);

            // update image result list statistics
            // its good to do this concurrently here, because it needs a DNS lookup
            // to compute a URL hash which is necessary for a double-check
            final CrawlProfile profile = in.queueEntry.profile();
View Full Code Here

                    // NOTE(review): excerpt — the enclosing method begins and ends outside this view.
                    if (documents != null) {
                        for (final Document document: documents) {
                            if (document.indexingDenied()) {
                                // Honor the document's indexing-denied flag by aborting with a parser failure.
                                throw new Parser.Failure("indexing is denied", url);
                            }
                            final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
                            // Register images and citation references before storing the index entry.
                            ResultImages.registerImages(url, document, true);
                            Switchboard.this.webStructure.generateCitationReference(url, document, condenser);
                            storeDocumentIndex(process, response, document, condenser, searchEvent, "heuristic:" + heuristicName);
                            Switchboard.this.log.logInfo("addToIndex fill of url " + url.toNormalform(true, true) + " finished");
                        }
View Full Code Here

    // Ad-hoc CLI test (duplicate excerpt): parse a local .torrent file (args[0]) and print its word statistics.
    // NOTE(review): excerpt — the method is truncated below; closing braces lie outside this view.
    public static void main(String[] args) {
        try {
            byte[] b = FileUtils.read(new File(args[0]));
            torrentParser parser = new torrentParser();
            // Placeholder URL — presumably only used for parser bookkeeping, not fetched; confirm.
            Document[] d = parser.parse(new MultiProtocolURI("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b));
            Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib);
            Map<String, Word> w = c.words();
            // Dump each extracted word together with its first position in the text.
            for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Parser.Failure e) {
            e.printStackTrace();
View Full Code Here

    // NOTE(review): excerpt — the enclosing method begins outside this view.
    StringBuilder token;

    if(document != null) {

      //get words from document
      final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib).words();
     
      // generate potential tags from document title, description and subject
      // Buffer is presized to the combined metadata length plus slack for separators.
      final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
      final StringBuilder buffer = new StringBuilder(bufferSize);
      final StringBuilder pwords = new StringBuilder(1000);
View Full Code Here

        // NOTE(review): duplicate excerpt — the enclosing method begins and ends outside this view.
        // Build one Condenser per parsed document of this queue entry.
        final Condenser[] condenser = new Condenser[in.documents.length];
        if (this.log.isFine()) {
            this.log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'");
        }
        for (int i = 0; i < in.documents.length; i++) {
            // Text/media indexing flags are taken from the crawl profile of the queue entry.
            condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib);

            // update image result list statistics
            // its good to do this concurrently here, because it needs a DNS lookup
            // to compute a URL hash which is necessary for a double-check
            final CrawlProfile profile = in.queueEntry.profile();
View Full Code Here

TOP

Related Classes of net.yacy.document.Condenser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.