Examples of BulletParser


Examples of it.unimi.dsi.parser.BulletParser

   
    return super.parseProperty( key, values, metadata );
  }

  private void init() {
    this.parser = new BulletParser();
   
    ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();
    composedBuilder.add( this.textExtractor = new TextExtractor() );
    composedBuilder.add( this.anchorExtractor = new AnchorExtractor( maxPreAnchor, maxAnchor, maxPostAnchor ) );
    parser.setCallback( composedBuilder.compose() );
View Full Code Here

Examples of it.unimi.dsi.parser.BulletParser

      "fetcher.max_outlinks", 5000);

  private Set<String> urls;

  public HTMLParser() {
    bulletParser = new BulletParser();
    textExtractor = new TextExtractor();
    linkExtractor = new LinkExtractor();
   
    linkExtractor.setIncludeImagesSources(Configurations
        .getBooleanProperty("crawler.include_images", false));
View Full Code Here

Examples of it.unimi.dsi.parser.BulletParser

    public void parse(Page page) {
        for (ParserCallback parserCallback : parserCallbacks) {
            parserCallback.startPage(page);

            BulletParser bulletParser = new BulletParser();
            bulletParser.setCallback(parserCallback);
            bulletParser.parse(page.getContentString().toCharArray());

            parserCallback.endPage(page);
        }
    }
View Full Code Here

Examples of it.unimi.dsi.parser.BulletParser

        final Charset[] charset = {retrieveCharsetFromContentType(contentType)}; // use an array so it can be accessed by inner classes
        if (charset[0] != null)
            return charset[0];

        final BulletParser parser = new BulletParser(HTMLFactory.INSTANCE);

        parser.setCallback(new MyCallback(charset));

        final byte[] content = start >= 0 ? warcRecord.getContent() : null;
        final Reader reader = new InputStreamReader(new ByteArrayInputStream(content, start, content.length-start), Charset.defaultCharset());


        try {
            int read = reader.read(buffer, 0, buffer.length);
            parser.parse(buffer, 0, read);
        } catch (IOException e) {
            LOGGER.error("Error while reading stored HTML file");
        }

        if (charset[0] != null) {
View Full Code Here

Examples of it.unimi.dsi.parser.BulletParser

 
 
  private void init() {
    /** The HTML bullet parser */
    this.parser = new BulletParser(HTMLFactory.INSTANCE);

    ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();

    composedBuilder
        .add(this.textExtractor = new StructuredTextExtractor());
View Full Code Here

Examples of it.unimi.dsi.parser.BulletParser

                WarcHTMLResponseRecord w = new WarcHTMLResponseRecord(warcRecord);
                if (w.isHTMLResponse()) {
                    //System.out.println(w.getHTMLContent());

                    // See how the parsed content looks like
                    BulletParser parser = new BulletParser(TRECParsingFactory.INSTANCE);
                    ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();
                    StructuredTextExtractor textExtractor = new StructuredTextExtractor();
                    composedBuilder.add(textExtractor);
                    parser.setCallback(composedBuilder.compose());
                    parser.parse(w.getHTMLContent().toCharArray());
                    System.out.println(textExtractor.getText());
                }
            }
            in.close();
            stream.close();
View Full Code Here

Examples of it.unimi.dsi.parser.BulletParser

    static public QuerySet readTopics(BufferedReader reader,
                                      final boolean quoteCommas) throws IOException {
        logger.debug("Reading a topic file");
        final DefaultQuerySet querySet = new DefaultQuerySet();

        BulletParser bulletParser = new BulletParser(
                TRECParsingFactory.INSTANCE);

        bulletParser.setCallback(new DefaultCallback() {
            TRECTopic topic = null;
            MutableString curText = new MutableString();
            Element curElement;

            @Override
            public boolean characters(char[] text, int offset, int length,
                                      boolean flowBroken) {
                curText.append(text, offset, length);
                return true;
            }

            @Override
            public boolean startElement(Element element,
                                        Map<Attribute, MutableString> attrMapUnused) {

                // --- New tag
                if (topic != null)
                    process();

                // ---
                if (element == ELEMENT_TOP) {
                    topic = new TRECTopic();
                }
                curElement = element;
                return true;
            }

            void removePrefix(String prefix, MutableString text) {
                if (text.startsWith(prefix))
                    text.delete(0, prefix.length());

            }

            private void process() {
                curText.trim();
                curText.replace('\n', ' ');
                curText.squeezeSpaces(false);

                if (curElement == ELEMENT_TITLE) {
                    removePrefix("Topic: ", curText);
                    if (quoteCommas) {
                        StringBuilder builder = new StringBuilder();
                        boolean first = true;
                        for (String part : curText.toString()
                                .split("\\s*,\\s*")) {
                            if (first)
                                first = false;
                            else
                                builder.append(' ');

                            if (part.indexOf(' ') >= 0) {
                                builder.append('"');
                                builder.append(part);
                                builder.append('"');
                            } else
                                builder.append(part);
                        }

                        topic.title = builder.toString();
                    } else
                        topic.title = curText.toString();
                } else if (curElement == ELEMENT_NUM) {
                    removePrefix("Number: ", curText);
                    // Normalise the number
                    topic.id = new Integer(curText.toString()).toString();
                } else if (curElement == ELEMENT_DESC) {
                    removePrefix("Description: ", curText);
                    topic.description = curText.toString();
                } else if (curElement == ELEMENT_NARR) {
                    // TREC
                    removePrefix("Narrative: ", curText);
                    topic.narrative = curText.toString();
                } else if (curElement == ELEMENT_SMRY) {
                    removePrefix("Summary: ", curText);
                    topic.summary = curText.toString();
                } else if (curElement == ELEMENT_CON) {
                    // TREC 1
                    removePrefix("Concepts: ", curText);
                    topic.concepts = curText.toString();
                } else if (curElement == ELEMENT_DEF) {
                    // TREC 1
                    removePrefix("Definition(s): ", curText);
                    removePrefix("Definition: ", curText);
                    topic.definitions = curText.toString();
                }
                curElement = null;
                curText.delete(0, curText.length());
            }

            @Override
            public boolean endElement(Element element) {
                if (topic != null)
                    process();

                if (element == ELEMENT_TOP) {
                    if (topic.id == null) {
                        logger.warn("Topic had no identifier - skipping");
                    } else {
                        logger.debug(String.format("Adding topic %s with title [%s]",
                                topic.id, topic.title));

                        querySet.put(topic.id, topic);
                    }
                    topic = null;
                }
                return true;
            }
        });

        // Read the file & parse
        char text[] = new char[8192];
        int offset = 0, l;
        while ((l = reader.read(text, offset, text.length - offset)) > 0) {
            offset += l;
            text = CharArrays.grow(text, offset + 1);
        }

        bulletParser.parseText(true);
        bulletParser.parseCDATA(true);
        bulletParser.parseTags(true);
        bulletParser.parse(text, 0, offset);

        return querySet;

    }
View Full Code Here

Examples of it.unimi.dsi.parser.BulletParser

    private void init() {
        LOGGER.info("Initialising the TREC document factory");

        // The parser is a SGML BulletParser with TREC vocabulary
        this.parser = new BulletParser(TRECParsingFactory.INSTANCE);

        ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();

        composedBuilder.add(this.textExtractor = new StructuredTextExtractor());
View Full Code Here

Examples of it.unimi.dsi.parser.BulletParser

  static public DefaultQuerySet readTopics(BufferedReader reader,
                                             final boolean quoteCommas) throws IOException {
    logger.debug("Reading a topic file");
    final DefaultQuerySet querySet = new DefaultQuerySet();

    BulletParser bulletParser = new BulletParser(
        TRECParsingFactory.INSTANCE);

    bulletParser.setCallback(new DefaultCallback() {
      TRECTopic topic = null;
      MutableString curText = new MutableString();
      Element curElement;

      @Override
      public boolean characters(char[] text, int offset, int length,
          boolean flowBroken) {
        curText.append(text, offset, length);
        return true;
      }

      @Override
      public boolean startElement(Element element,
          Map<Attribute, MutableString> attrMapUnused) {

        // --- New tag
        if (topic != null)
          process();

        // ---
        if (element == TRECParsingFactory.ELEMENT_TOP) {
          topic = new TRECTopic();
        }
        curElement = element;
        return true;
      }

      void removePrefix(String prefix, MutableString text) {
        if (text.startsWith(prefix))
          text.delete(0, prefix.length());

      }

      private void process() {
        curText.trim();
        curText.replace('\n', ' ');
        curText.squeezeSpaces(false);

        if (curElement == TRECParsingFactory.ELEMENT_TITLE) {
          removePrefix("Topic: ", curText);
          if (quoteCommas) {
            StringBuilder builder = new StringBuilder();
            boolean first = true;
            for (String part : curText.toString()
                .split("\\s*,\\s*")) {
              if (first)
                first = false;
              else
                builder.append(' ');

              if (part.indexOf(' ') >= 0) {
                builder.append('"');
                builder.append(part);
                builder.append('"');
              } else
                builder.append(part);
            }

            topic.title = builder.toString();
          } else
            topic.title = curText.toString();
        } else if (curElement == TRECParsingFactory.ELEMENT_NUM) {
          removePrefix("Number: ", curText);
          // Normalise the number
          topic.id = new Integer(curText.toString()).toString();
        } else if (curElement == TRECParsingFactory.ELEMENT_DESC) {
          removePrefix("Description: ", curText);
          topic.description = curText.toString();
        } else if (curElement == TRECParsingFactory.ELEMENT_NARR) {
          // TREC
          removePrefix("Narrative: ", curText);
          topic.narrative = curText.toString();
        } else if (curElement == TRECParsingFactory.ELEMENT_SMRY) {
          removePrefix("Summary: ", curText);
          topic.summary = curText.toString();
        } else if (curElement == TRECParsingFactory.ELEMENT_CON) {
          // TREC 1
          removePrefix("Concepts: ", curText);
          topic.concepts = curText.toString();
        } else if (curElement == TRECParsingFactory.ELEMENT_DEF) {
          // TREC 1
          removePrefix("Definition(s): ", curText);
          removePrefix("Definition: ", curText);
          topic.definitions = curText.toString();
        }
        curElement = null;
        curText.delete(0, curText.length());
      }

      @Override
      public boolean endElement(Element element) {
        if (topic != null)
          process();

        if (element == TRECParsingFactory.ELEMENT_TOP) {
          if (topic.id == null) {
            logger.warn("Topic had no identifier - skipping");
          } else {
            logger.debug(new LazyString("Adding topic %s with title [%s]",
                topic.id, topic.title));

            querySet.put(topic.id, topic);
          }
          topic = null;
        }
        return true;
      }
    });

    // Read the file & parse
    char text[] = new char[8192];
    int offset = 0, l;
    while ((l = reader.read(text, offset, text.length - offset)) > 0) {
      offset += l;
      text = CharArrays.grow(text, offset + 1);
    }

    bulletParser.parseText(true);
    bulletParser.parseCDATA(true);
    bulletParser.parseTags(true);
    bulletParser.parse(text, 0, offset);

    return querySet;

  }
View Full Code Here

Examples of it.unimi.dsi.parser.BulletParser

        byte[] docnoBuffer = new byte[512];

        Charset charset = Charset.forName("ISO-8859-1");
        final CharsetDecoder decoder = charset.newDecoder();

        final BulletParser parser = new BulletParser(TRECParsingFactory.INSTANCE);


        final HashSet<Element> elements = new HashSet<>(Arrays.asList(TRECParsingFactory.ELEMENT_TEXT,
                TRECParsingFactory.ELEMENT_HEADLINE, TRECParsingFactory.ELEMENT_LEADPARA,
                TRECParsingFactory.ELEMENT_ABST, TRECParsingFactory.ELEMENT_SUMMARY,
                TRECParsingFactory.ELEMENT_HL));

        final PrintWriter writer = new PrintWriter(System.out);

        // Callback that writes to `writer` the text found inside any of the
        // elements listed in `elements`; text outside of them is dropped.
        parser.setCallback(new Callback() {
            // Nesting depth of the content elements currently open
            int inContent;
            // NOTE(review): within this callback `output` is only ever set to
            // false (in startDocument), so the `if (output)` branch below never
            // runs and endDocument always prints a newline. It was presumably
            // meant to be set to true once some text is written - confirm.
            boolean output;

            // Only tags and text are needed: enable both on the parser.
            @Override
            public void configure(BulletParser parser) {
                parser.parseTags(true);
                parser.parseText(true);
            }

            // Resets the per-document state.
            @Override
            public void startDocument() {
                inContent = 0;
                output = false;
            }

            // Entering a content element increases the nesting depth; a line
            // break is emitted first when `output` is set.
            @Override
            public boolean startElement(Element element, Map<Attribute, MutableString> attributeMutableStringMap) {
                if (elements.contains(element)) {
                    inContent++;
                    if (output)
                       writer.println();
                }
                return true;
            }

            // Leaving a content element decreases the nesting depth.
            @Override
            public boolean endElement(Element element) {
                if (elements.contains(element)) {
                    inContent--;
                }
                return true;
            }

            // Copies the text to the writer when inside a content element,
            // replacing every space character by a plain ' '.
            // NOTE(review): Character.isSpaceChar only matches Unicode space,
            // line and paragraph separators; '\n', '\r' and '\t' are control
            // characters and are written unchanged. If the goal is one line of
            // text per element, Character.isWhitespace may be intended - confirm.
            @Override
            public boolean characters(char[] chars, int offset, int length, boolean b) {
                if (inContent > 0) {
                    for(int i = offset; i < offset+length; i++) {
                        if (Character.isSpaceChar(chars[i]))
                            writer.write(' ');
                        else
                            writer.write(chars[i]);
                    }
                }
                return true;
            }

            // CDATA sections are ignored.
            @Override
            public boolean cdata(Element element, char[] chars, int i, int i1) {
                return true;
            }

            // Terminates the line at the end of the document when `output` is false.
            @Override
            public void endDocument() {
                if (!output) writer.println();
            }
        });

        TRECDocumentCollection.EventHandler handler = new TRECDocumentCollection.EventHandler() {
            char[] text = new char[8192];
            int offset = 0;

            @Override
            public void startDocument() {
                offset = 0;
            }

            @Override
            public void endDocument(String docno, long currStart, long currStop) throws IOException {
                writer.format("%%%%%% DOCUMENT: %s%n", docno);
                parser.parse(text, 0, offset);
                writer.println();
            }

            @Override
            public void write(byte[] bytes, int offset, int length) {
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.