Examples of TextExtractor


Examples of au.id.jericho.lib.html.TextExtractor

  }
 
  /**
   * Runs the given HTML string through a Jericho {@code TextExtractor} and
   * returns the extractor's string output.
   *
   * @param htmlContent the HTML markup to process; may be {@code null} or empty
   * @return the extractor output, or {@code null} when the input is
   *         {@code null}/empty or the extractor output is {@code null}/empty
   */
  private static String extractText(String htmlContent) {
    if (htmlContent != null && htmlContent.length() > 0) {
      Source source = new Source(htmlContent);
      TextExtractor extractor = new TextExtractor(source);
      // Extractor options. NOTE(review): going by the setter names, this turns on
      // non-breaking-space conversion and turns off both the exclusion of non-HTML
      // elements and the inclusion of attribute values - confirm against the
      // Jericho TextExtractor documentation.
      extractor.setConvertNonBreakingSpaces(true);
      extractor.setExcludeNonHTMLElements(false);
      extractor.setIncludeAttributes(false);
      String output = extractor.toString();
      // Only a non-empty result is returned; an empty one falls through to null.
      if (output != null && output.length() > 0) {
        return output;
      }
    }
    // Reached for null/empty input or a null/empty extraction result.
    return null;
View Full Code Here

Examples of it.unimi.dsi.parser.callback.TextExtractor

  private void init() {
    this.parser = new BulletParser();
   
    ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();
    composedBuilder.add( this.textExtractor = new TextExtractor() );
    composedBuilder.add( this.anchorExtractor = new AnchorExtractor( maxPreAnchor, maxAnchor, maxPostAnchor ) );
    parser.setCallback( composedBuilder.compose() );

    Object o;
    try {
View Full Code Here

Examples of it.unimi.dsi.parser.callback.TextExtractor

  private Set<String> urls;

  public HTMLParser() {
    bulletParser = new BulletParser();
    textExtractor = new TextExtractor();
    linkExtractor = new LinkExtractor();
   
    linkExtractor.setIncludeImagesSources(Configurations
        .getBooleanProperty("crawler.include_images", false));
  }
View Full Code Here

Examples of net.htmlparser.jericho.TextExtractor

            //Search for primary field if present
            try {
                String itemName = getPrimaryNodeType().getPrimaryItemName();
                if (itemName != null) {
                    String s = getProperty(itemName).getValue().getString();
                    title = new TextExtractor(new Source(s != null ? s : getName())).toString();
                }
            } catch (RepositoryException e1) {
                title = null;
            }
        }
View Full Code Here

Examples of org.apache.jackrabbit.extractor.TextExtractor

     * Factory method to create the <code>TextExtractor</code> instance.
     *
     * @return the <code>TextExtractor</code> instance this index should use.
     */
    protected TextExtractor createTextExtractor() {
        TextExtractor txtExtr = new JackrabbitTextExtractor(textFilterClasses);
        if (extractorPoolSize > 0) {
            // wrap with pool
            txtExtr = new PooledTextExtractor(txtExtr, extractorPoolSize,
                    extractorBackLog, extractorTimeout);
        }
View Full Code Here

Examples of org.apache.jackrabbit.extractor.TextExtractor

     * Factory method to create the <code>TextExtractor</code> instance.
     *
     * @return the <code>TextExtractor</code> instance this index should use.
     */
    protected TextExtractor createTextExtractor() {
        TextExtractor txtExtr = new JackrabbitTextExtractor(textFilterClasses);
        if (extractorPoolSize > 0) {
            // wrap with pool
            txtExtr = new PooledTextExtractor(txtExtr, extractorPoolSize,
                    extractorBackLog, extractorTimeout);
        }
View Full Code Here

Examples of org.apache.jackrabbit.extractor.TextExtractor

     * Factory method to create the <code>TextExtractor</code> instance.
     *
     * @return the <code>TextExtractor</code> instance this index should use.
     */
    protected TextExtractor createTextExtractor() {
        TextExtractor txtExtr = new JackrabbitTextExtractor(textFilterClasses);
        if (extractorPoolSize > 0) {
            // wrap with pool
            txtExtr = new PooledTextExtractor(txtExtr, extractorPoolSize,
                    extractorBackLog, extractorTimeout);
        }
View Full Code Here

Examples of org.apache.jackrabbit.extractor.TextExtractor

     * Factory method to create the <code>TextExtractor</code> instance.
     *
     * @return the <code>TextExtractor</code> instance this index should use.
     */
    protected TextExtractor createTextExtractor() {
        TextExtractor txtExtr = new JackrabbitTextExtractor(textFilterClasses);
        if (extractorPoolSize > 0) {
            // wrap with pool
            txtExtr = new PooledTextExtractor(txtExtr, extractorPoolSize,
                    extractorBackLog, extractorTimeout);
        }
View Full Code Here

Examples of org.apache.jackrabbit.extractor.TextExtractor

     * Factory method to create the <code>TextExtractor</code> instance.
     *
     * @return the <code>TextExtractor</code> instance this index should use.
     */
    protected TextExtractor createTextExtractor() {
        TextExtractor txtExtr = new JackrabbitTextExtractor(textFilterClasses);
        if (extractorPoolSize > 0) {
            // wrap with pool
            txtExtr = new PooledTextExtractor(txtExtr, extractorPoolSize,
                    extractorBackLog, extractorTimeout);
        }
View Full Code Here

Examples of org.apache.jackrabbit.extractor.TextExtractor

     * Factory method to create the <code>TextExtractor</code> instance.
     *
     * @return the <code>TextExtractor</code> instance this index should use.
     */
    protected TextExtractor createTextExtractor() {
        TextExtractor txtExtr = new JackrabbitTextExtractor(textFilterClasses);
        if (extractorPoolSize > 0) {
            // wrap with pool
            txtExtr = new PooledTextExtractor(txtExtr, extractorPoolSize,
                    extractorBackLog, extractorTimeout);
        }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.