Examples of AutoDetectReader

jreepad.io.AutoDetectReader
Reads a Jreepad file automatically detecting file type (XML or HJT). @version $Id$
org.apache.tika.detect.AutoDetectReader
An input stream reader that automatically detects the character encoding to be used for converting bytes to characters. @since Apache Tika 1.2

Examples of jreepad.io.AutoDetectReader

    // Load the file to be searched
    JreepadNode root = new JreepadNode();
    try
    {
        InputStream in = new FileInputStream(userFile);
        JreepadReader reader = new AutoDetectReader(encoding, false);
        root = reader.read(in).getRootNode();
    }
    catch(IOException err)
    {
      System.out.println("File input error: " + err);
      System.exit(1);

View Full Code Here

Examples of jreepad.io.AutoDetectReader

  {
    getPrefs().openLocation = file; // Remember the open directory
    try
    {
      InputStream in = new FileInputStream(file);
      JreepadReader reader = new AutoDetectReader(getPrefs().getEncoding(), getPrefs().autoDetectHtmlArticles);
      document = reader.read(in);
      document.setSaveLocation(file);
    }
    catch(IOException e)
    {
      JOptionPane.showMessageDialog(this, e, lang.getString("MSG_LOAD_FILE_FAILED") , JOptionPane.ERROR_MESSAGE);

View Full Code Here

Examples of jreepad.io.AutoDetectReader


        switch(importFormat)
        {
          case FILE_FORMAT_HJT:
            InputStream in = new FileInputStream(getPrefs().importLocation);
            JreepadReader reader = new AutoDetectReader(getPrefs().getEncoding(), getPrefs().autoDetectHtmlArticles);
            theJreepad.addChild(reader.read(in).getRootNode());
            break;
          case FILE_FORMAT_TEXT:
            theJreepad.addChildrenFromTextFiles(fileChooser.getSelectedFiles());
            break;
          case FILE_FORMAT_TEXTASLIST:

View Full Code Here

Examples of org.apache.tika.detect.AutoDetectReader


  @Override
  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {


    AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER));


    try {
      Charset charset = reader.getCharset();
      String mediaType = metadata.get(Metadata.CONTENT_TYPE);
      String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
      if (mediaType != null && name != null) {
        MediaType type = MediaType.parse(mediaType);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());


        StringBuilder out = new StringBuilder();
        String line;
        int nbLines =  0;
        while ((line = reader.readLine()) != null) {
            out.append(line);
            String author = parserAuthor(line);
            if (author != null) {
              metadata.add(TikaCoreProperties.CREATOR, author);
            }
            nbLines ++;
        }
        metadata.set("LoC", String.valueOf(nbLines));


        Renderer renderer = getRenderer(type.toString());
        String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
        char[] charArray = codeAsHtml.toCharArray();
        handler.startDocument();
        handler.characters(charArray, 0, charArray.length);
        handler.endDocument();
      }
    } finally {
      reader.close();
    }


  }

View Full Code Here

Examples of org.apache.tika.detect.AutoDetectReader

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Automatically detect the character encoding
        AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata,
                context.get(ServiceLoader.class, LOADER));
        try {
            Charset charset = reader.getCharset();
            String previous = metadata.get(Metadata.CONTENT_TYPE);
            if (previous == null || previous.startsWith("text/html")) {
                MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
            }
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());


            // Get the HTML mapper from the parse context
            HtmlMapper mapper =
                    context.get(HtmlMapper.class, new HtmlParserMapper());


            // Parse the HTML document
            org.ccil.cowan.tagsoup.Parser parser =
                    new org.ccil.cowan.tagsoup.Parser();


            // TIKA-528: Reuse share schema to avoid heavy instantiation
            parser.setProperty(
                    org.ccil.cowan.tagsoup.Parser.schemaProperty, HTML_SCHEMA);
            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
            parser.setFeature(
                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);


            parser.setContentHandler(new XHTMLDowngradeHandler(
                    new HtmlHandler(mapper, handler, metadata)));


            parser.parse(reader.asInputSource());
        } finally {
            reader.close();
        }
    }

View Full Code Here

Examples of org.apache.tika.detect.AutoDetectReader

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Automatically detect the character encoding
        AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata, LOADER);
        try {
            Charset charset = reader.getCharset();
            MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());


            XHTMLContentHandler xhtml =
                    new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();


            xhtml.startElement("p");
            char[] buffer = new char[4096];
            int n = reader.read(buffer);
            while (n != -1) {
                xhtml.characters(buffer, 0, n);
                n = reader.read(buffer);
            }
            xhtml.endElement("p");


            xhtml.endDocument();
        } finally {
            reader.close();
        }
    }

View Full Code Here

Examples of org.apache.tika.detect.AutoDetectReader

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Automatically detect the character encoding
        AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata, LOADER);
        try {
            Charset charset = reader.getCharset();
            String previous = metadata.get(Metadata.CONTENT_TYPE);
            if (previous == null || previous.startsWith("text/html")) {
                MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
            }
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());


            // Get the HTML mapper from the parse context
            HtmlMapper mapper =
                    context.get(HtmlMapper.class, new HtmlParserMapper());


            // Parse the HTML document
            org.ccil.cowan.tagsoup.Parser parser =
                    new org.ccil.cowan.tagsoup.Parser();


            // TIKA-528: Reuse share schema to avoid heavy instantiation
            parser.setProperty(
                    org.ccil.cowan.tagsoup.Parser.schemaProperty, HTML_SCHEMA);
            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
            parser.setFeature(
                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);


            parser.setContentHandler(new XHTMLDowngradeHandler(
                    new HtmlHandler(mapper, handler, metadata)));


            parser.parse(reader.asInputSource());
        } finally {
            reader.close();
        }
    }

View Full Code Here

Examples of org.apache.tika.detect.AutoDetectReader

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Automatically detect the character encoding
        AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata,
                context.get(ServiceLoader.class, LOADER));
        try {
            Charset charset = reader.getCharset();
            MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());


            XHTMLContentHandler xhtml =
                    new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();


            xhtml.startElement("p");
            char[] buffer = new char[4096];
            int n = reader.read(buffer);
            while (n != -1) {
                xhtml.characters(buffer, 0, n);
                n = reader.read(buffer);
            }
            xhtml.endElement("p");


            xhtml.endDocument();
        } finally {
            reader.close();
        }
    }

View Full Code Here

Examples of org.apache.tika.detect.AutoDetectReader

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Automatically detect the character encoding
        AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata,
                context.get(ServiceLoader.class, LOADER));
        try {
            Charset charset = reader.getCharset();
            String previous = metadata.get(Metadata.CONTENT_TYPE);
            if (previous == null || previous.startsWith("text/html")) {
                MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
            }
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());


            // Get the HTML mapper from the parse context
            HtmlMapper mapper =
                    context.get(HtmlMapper.class, new HtmlParserMapper());


            // Parse the HTML document
            org.ccil.cowan.tagsoup.Parser parser =
                    new org.ccil.cowan.tagsoup.Parser();


            // Use schema from context or default
            Schema schema = context.get(Schema.class, HTML_SCHEMA);


            // TIKA-528: Reuse share schema to avoid heavy instantiation
            parser.setProperty(
                    org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
            parser.setFeature(
                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);


            parser.setContentHandler(new XHTMLDowngradeHandler(
                    new HtmlHandler(mapper, handler, metadata)));


            parser.parse(reader.asInputSource());
        } finally {
            reader.close();
        }
    }

View Full Code Here

Examples of org.apache.tika.detect.AutoDetectReader


  @Override
  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {


    AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER));


    try {
      Charset charset = reader.getCharset();
      String mediaType = metadata.get(Metadata.CONTENT_TYPE);
      String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
      if (mediaType != null && name != null) {
        MediaType type = MediaType.parse(mediaType);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());


        StringBuilder out = new StringBuilder();
        String line;
        int nbLines =  0;
        while ((line = reader.readLine()) != null) {
            out.append(line + System.getProperty("line.separator"));
            String author = parserAuthor(line);
            if (author != null) {
              metadata.add(TikaCoreProperties.CREATOR, author);
            }
            nbLines ++;
        }
        metadata.set("LoC", String.valueOf(nbLines));
        Renderer renderer = getRenderer(type.toString());
        
        String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
        
        Schema schema = context.get(Schema.class, HTML_SCHEMA);


        org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
        parser.setContentHandler(handler);
        parser.parse(new InputSource(new StringReader(codeAsHtml)));
      }
    } finally {
      reader.close();
    }


  }

View Full Code Here

0 1

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.