Package net.yacy.document.parser.html

Source Code of net.yacy.document.parser.html.ContentTransformer

// ContentTransformer.java
// ---------------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// $LastChangedDate: 2011-05-27 10:24:54 +0200 (Fr, 27. Mai 2011) $
// $LastChangedRevision: 7746 $
// $LastChangedBy: orbiter $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package net.yacy.document.parser.html;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Properties;
import java.util.TreeSet;

import net.yacy.cora.document.ASCII;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;

public class ContentTransformer extends AbstractTransformer implements Transformer {
   
    // statics: for initialization of the HTMLFilterAbstractTransformer
    private static final TreeSet<String> linkTags0 = new TreeSet<String>(ASCII.insensitiveASCIIComparator);
    private static final TreeSet<String> linkTags1 = new TreeSet<String>(ASCII.insensitiveASCIIComparator);

    static {
        linkTags0.add("img");
        linkTags0.add("input");

        linkTags1.add("a");
    }

    private ArrayList<String> bluelist = null;

    public ContentTransformer() {
        super(linkTags0, linkTags1);
    }

    public void init(final String initarg) {
        if (bluelist == null) {
            // here, the init arg is used to load a list of blue-listed words
            bluelist = new ArrayList<String>();
            final File f = new File(initarg);
            if (f.canRead()) {
                try {
                    final BufferedReader r = new BufferedReader(new FileReader(f));
                    String s;
                    while ((s = r.readLine()) != null) {
                        if (s.length() > 0 && s.charAt(0) != '#') bluelist.add(s.toLowerCase());
                    }
                    r.close();
                } catch (final IOException e) {
                }
                // if (bluelist.isEmpty()) System.out.println("BLUELIST is empty");
            }
        }
    }

    public boolean isIdentityTransformer() {
        return bluelist.isEmpty();
    }

    private static char[] genBlueLetters(int length) {
            final CharBuffer bb = new CharBuffer(" <FONT COLOR=#0000FF>".toCharArray());
            length = length / 2;
            if (length > 10) length = 7;
            while (length-- > 0) {
                bb.append((int)'X');
            }
            bb.append("</FONT> ");
            final char[] result = bb.getChars();
            try {
        bb.close();
      } catch (IOException e) {
          Log.logException(e);
      }
            return result;
    }

    private boolean bluelistHit(final char[] text) {
        if (text == null || bluelist == null) return false;
        final String lc = new String(text).toLowerCase();
        for (int i = 0; i < bluelist.size(); i++) {
            if (lc.indexOf(bluelist.get(i)) >= 0) return true;
        }
        return false;
    }
   
    public char[] transformText(final char[] text) {
        if (bluelist != null) {
            if (bluelistHit(text)) {
                // System.out.println("FILTERHIT: " + text);
                return genBlueLetters(text.length);
            }
            return text;
        }
        return text;
    }

    @Override
    public char[] transformTag0(final String tagname, final Properties tagopts, final char quotechar) {
        if (tagname.equals("img")) {
            // check bluelist
            if (bluelistHit(tagopts.getProperty("src", "").toCharArray())) return genBlueLetters(5);
            if (bluelistHit(tagopts.getProperty("alt", "").toCharArray())) return genBlueLetters(5);

            // replace image alternative name
            tagopts.setProperty("alt", new String(transformText(tagopts.getProperty("alt", "").toCharArray())));
        }
        if (tagname.equals("input") && (tagopts.getProperty("type") != null && tagopts.getProperty("type").equals("submit"))) {
            // rewrite button name
            tagopts.setProperty("value", new String(transformText(tagopts.getProperty("value", "").toCharArray())));
        }
        return TransformerWriter.genTag0(tagname, tagopts, quotechar);
    }

    @Override
    public char[] transformTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
        if (bluelistHit(tagopts.getProperty("href","").toCharArray())) return genBlueLetters(text.length);
        if (bluelistHit(text)) return genBlueLetters(text.length);
        return TransformerWriter.genTag1(tagname, tagopts, text, quotechar);
    }

    @Override
    public void close() {
        // free resources
        super.close();
    }

}
TOP

Related Classes of net.yacy.document.parser.html.ContentTransformer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.