Package org.htmlparser

Examples of org.htmlparser.Tag


            NodeList links = nodelist.extractAllNodesThatMatch(filter);
            for (NodeIterator ni = links.elements(); ni.hasMoreNodes(); ) {
                Node node = ni.nextNode();
                if (node instanceof Tag) {
                    boolean got = false;
                    Tag tag = (Tag) node;
                    String src = tag.getAttribute("src");
                    if (src != null) {
                        processLink(base, src);
                        got = true;
                    }
                    String href = tag.getAttribute("href");
                    if (href != null) {
                        processLink(base, href);
                        got = true;
                    }
                    if (!got) {
View Full Code Here


            Node node = e.nextNode();
            // a url is always in a Tag.
            if (!(node instanceof Tag)) {
                continue;
            }
            Tag tag = (Tag) node;
            String tagname=tag.getTagName();
            String binUrlStr = null;

            // first we check to see if body tag has a
            // background set
            if (tag instanceof BodyTag) {
                binUrlStr = tag.getAttribute(ATT_BACKGROUND);
            } else if (tag instanceof BaseHrefTag) {
                BaseHrefTag baseHref = (BaseHrefTag) tag;
                String baseref = baseHref.getBaseUrl();
                try {
                    if (!baseref.equals(""))// Bugzilla 30713
                    {
                        baseUrl.url = ConversionUtils.makeRelativeURL(baseUrl.url, baseHref.getBaseUrl());
                    }
                } catch (MalformedURLException e1) {
                    throw new HTMLParseException(e1);
                }
            } else if (tag instanceof ImageTag) {
                ImageTag image = (ImageTag) tag;
                binUrlStr = image.getImageURL();
            } else if (tag instanceof AppletTag) {
                // look for applets

                // This will only work with an Applet .class file.
                // Ideally, this should be upgraded to work with Objects (IE)
                // and archives (.jar and .zip) files as well.
                AppletTag applet = (AppletTag) tag;
                binUrlStr = applet.getAppletClass();
            } else if (tag instanceof InputTag) {
                // we check the input tag type for image
                if (ATT_IS_IMAGE.equalsIgnoreCase(tag.getAttribute(ATT_TYPE))) {
                    // then we need to download the binary
                    binUrlStr = tag.getAttribute(ATT_SRC);
                }
            } else if (tag instanceof LinkTag) {
                LinkTag link = (LinkTag) tag;
                if (link.getChild(0) instanceof ImageTag) {
                    ImageTag img = (ImageTag) link.getChild(0);
                    binUrlStr = img.getImageURL();
                }
            } else if (tag instanceof ScriptTag) {
                binUrlStr = tag.getAttribute(ATT_SRC);
            } else if (tag instanceof FrameTag) {
                binUrlStr = tag.getAttribute(ATT_SRC);
            } else if (tagname.equalsIgnoreCase(TAG_EMBED)
                || tagname.equalsIgnoreCase(TAG_BGSOUND)){
                binUrlStr = tag.getAttribute(ATT_SRC);
            } else if (tagname.equalsIgnoreCase(TAG_LINK)) {
                // Putting the string first means it works even if the attribute is null
                if (STYLESHEET.equalsIgnoreCase(tag.getAttribute(ATT_REL))) {
                    binUrlStr = tag.getAttribute(ATT_HREF);
                }
            } else {
                binUrlStr = tag.getAttribute(ATT_BACKGROUND);
            }

            if (binUrlStr != null) {
                urls.addURL(binUrlStr, baseUrl.url);
            }

            // Now look for URLs in the STYLE attribute
            String styleTagStr = tag.getAttribute(ATT_STYLE);
            if(styleTagStr != null) {
                HtmlParsingUtils.extractStyleURLs(baseUrl.url, urls, styleTagStr);
            }

            // second, if the tag was a composite tag,
View Full Code Here

            Node node = e.nextNode();
            // a url is always in a Tag.
            if (!(node instanceof Tag)) {
                continue;
            }
            Tag tag = (Tag) node;

            // Only check form tags
            if (tag instanceof FormTag) {
                // Find the action / form url
                String action = tag.getAttribute("action");
                String acceptCharSet = tag.getAttribute("accept-charset");
                if(action != null && action.length() > 0) {
                    // We use the page encoding where the form resides, as the
                    // default encoding for the form
                    String formCharSet = pageEncoding;
                    // Check if we found an accept-charset attribute on the form
View Full Code Here

            Node node = e.nextNode();
            // a url is always in a Tag.
            if (!(node instanceof Tag)) {
                continue;
            }
            Tag tag = (Tag) node;

            // Only check form tags
            if (tag instanceof FormTag) {
                // Find the action / form url
                String action = tag.getAttribute("action");
                String acceptCharSet = tag.getAttribute("accept-charset");
                if(action != null && action.length() > 0) {
                    // We use the page encoding where the form resides, as the
                    // default encoding for the form
                    String formCharSet = pageEncoding;
                    // Check if we found an accept-charset attribute on the form
View Full Code Here

            Node node = e.nextNode();
            // a url is always in a Tag.
            if (!(node instanceof Tag)) {
                continue;
            }
            Tag tag = (Tag) node;
            String tagname=tag.getTagName();
            String binUrlStr = null;

            // first we check to see if body tag has a
            // background set
            if (tag instanceof BodyTag) {
                binUrlStr = tag.getAttribute(ATT_BACKGROUND);
            } else if (tag instanceof BaseHrefTag) {
                BaseHrefTag baseHref = (BaseHrefTag) tag;
                String baseref = baseHref.getBaseUrl().toString();
                try {
                    if (!baseref.equals(""))// Bugzilla 30713
                    {
                        baseUrl.url = new URL(baseUrl.url, baseHref.getBaseUrl());
                    }
                } catch (MalformedURLException e1) {
                    throw new HTMLParseException(e1);
                }
            } else if (tag instanceof ImageTag) {
                ImageTag image = (ImageTag) tag;
                binUrlStr = image.getImageURL();
            } else if (tag instanceof AppletTag) {
            // look for applets

            // This will only work with an Applet .class file.
            // Ideally, this should be upgraded to work with Objects (IE)
            // and archives (.jar and .zip) files as well.
                AppletTag applet = (AppletTag) tag;
                binUrlStr = applet.getAppletClass();
            } else if (tag instanceof InputTag) {
                // we check the input tag type for image
                if (ATT_IS_IMAGE.equalsIgnoreCase(tag.getAttribute(ATT_TYPE))) {
                    // then we need to download the binary
                    binUrlStr = tag.getAttribute(ATT_SRC);
                }
            } else if (tag instanceof LinkTag) {
                LinkTag link = (LinkTag) tag;
                if (link.getChild(0) instanceof ImageTag) {
                    ImageTag img = (ImageTag) link.getChild(0);
                    binUrlStr = img.getImageURL();
                }
            } else if (tag instanceof ScriptTag) {
                binUrlStr = tag.getAttribute(ATT_SRC);
            } else if (tag instanceof FrameTag) {
                binUrlStr = tag.getAttribute(ATT_SRC);
            } else if (tagname.equalsIgnoreCase(TAG_EMBED)
                || tagname.equalsIgnoreCase(TAG_BGSOUND)){
                binUrlStr = tag.getAttribute(ATT_SRC)
            } else if (tagname.equalsIgnoreCase(TAG_LINK)) {
                // Putting the string first means it works even if the attribute is null
                if (STYLESHEET.equalsIgnoreCase(tag.getAttribute(ATT_REL))) {
                    binUrlStr = tag.getAttribute(ATT_HREF);
                }
            } else {
                binUrlStr = tag.getAttribute(ATT_BACKGROUND);
            }

            if (binUrlStr != null) {
                urls.addURL(binUrlStr, baseUrl.url);
            }

            // Now look for URLs in the STYLE attribute
            String styleTagStr = tag.getAttribute(ATT_STYLE);
            if(styleTagStr != null) {
              HtmlParsingUtils.extractStyleURLs(baseUrl.url, urls, styleTagStr);
            }

            // second, if the tag was a composite tag,
View Full Code Here

    int begin = getBeginOffset(tag);
    int end = begin;
    if (tagClosed) {
      end = getEndOffset(tag);
    } else {
      Tag endTag = tag.getEndTag();
      if (endTag != null) {
        end = getEndOffset(endTag);
      } else {
        end = getEndOffset(tag);
        tagStillOpen = true;
View Full Code Here

            Node node = e.nextNode();
            // a url is always in a Tag.
            if (!(node instanceof Tag)) {
                continue;
            }
            Tag tag = (Tag) node;
            String tagname=tag.getTagName();
            String binUrlStr = null;

            // first we check to see if body tag has a
            // background set
            if (tag instanceof BodyTag) {
                binUrlStr = tag.getAttribute(ATT_BACKGROUND);
            } else if (tag instanceof BaseHrefTag) {
                BaseHrefTag baseHref = (BaseHrefTag) tag;
                String baseref = baseHref.getBaseUrl();
                try {
                    if (!baseref.equals(""))// Bugzilla 30713
                    {
                        baseUrl.url = ConversionUtils.makeRelativeURL(baseUrl.url, baseref);
                    }
                } catch (MalformedURLException e1) {
                    throw new HTMLParseException(e1);
                }
            } else if (tag instanceof ImageTag) {
                ImageTag image = (ImageTag) tag;
                binUrlStr = image.getImageURL();
            } else if (tag instanceof AppletTag) {
                // look for applets

                // This will only work with an Applet .class file.
                // Ideally, this should be upgraded to work with Objects (IE)
                // and archives (.jar and .zip) files as well.
                AppletTag applet = (AppletTag) tag;
                binUrlStr = applet.getAppletClass();
            } else if (tag instanceof ObjectTag) {
                // look for Objects
                ObjectTag applet = (ObjectTag) tag;
                String data = applet.getAttribute(ATT_CODEBASE);
                if(!StringUtils.isEmpty(data)) {
                    binUrlStr = data;              
                }
               
                data = applet.getAttribute(ATT_DATA);
                if(!StringUtils.isEmpty(data)) {
                    binUrlStr = data;                   
                }
               
            } else if (tag instanceof InputTag) {
                // we check the input tag type for image
                if (ATT_IS_IMAGE.equalsIgnoreCase(tag.getAttribute(ATT_TYPE))) {
                    // then we need to download the binary
                    binUrlStr = tag.getAttribute(ATT_SRC);
                }
            } else if (tag instanceof ScriptTag) {
                binUrlStr = tag.getAttribute(ATT_SRC);
                // Bug 51750
            } else if (tag instanceof FrameTag || tagname.equalsIgnoreCase(TAG_IFRAME)) {
                binUrlStr = tag.getAttribute(ATT_SRC);
            } else if (tagname.equalsIgnoreCase(TAG_EMBED)
                || tagname.equalsIgnoreCase(TAG_BGSOUND)){
                binUrlStr = tag.getAttribute(ATT_SRC);
            } else if (tagname.equalsIgnoreCase(TAG_LINK)) {
                // Putting the string first means it works even if the attribute is null
                if (STYLESHEET.equalsIgnoreCase(tag.getAttribute(ATT_REL))) {
                    binUrlStr = tag.getAttribute(ATT_HREF);
                }
            } else {
                binUrlStr = tag.getAttribute(ATT_BACKGROUND);
            }

            if (binUrlStr != null) {
                urls.addURL(binUrlStr, baseUrl.url);
            }

            // Now look for URLs in the STYLE attribute
            String styleTagStr = tag.getAttribute(ATT_STYLE);
            if(styleTagStr != null) {
                HtmlParsingUtils.extractStyleURLs(baseUrl.url, urls, styleTagStr);
            }

            // second, if the tag was a composite tag,
View Full Code Here

     * @return <code>true</code> if the node has the attribute
     * (and value if that is being checked too), <code>false</code> otherwise.
     */
    public boolean accept (Node node)
    {
        Tag tag;
        Attribute attribute;
        boolean ret;

        ret = false;
        if (node instanceof Tag)
        {
            tag = (Tag)node;
            attribute = tag.getAttributeEx (mAttribute);
            ret = null != attribute;
            if (ret && (null != mValue))
                ret = mValue.equals (attribute.getValue ());
        }

View Full Code Here

    protected void doSAX (Node node)
        throws
            ParserException,
            SAXException
    {
        Tag tag;
        Tag end;

        if (node instanceof Remark)
        {
            String text = mParser.getLexer ().getPage ().getText (node.getStartPosition (), node.getEndPosition ());
            mContentHandler.ignorableWhitespace (text.toCharArray (), 0, text.length ());
        }
        else if (node instanceof Text)
        {
            String text = mParser.getLexer ().getPage ().getText (node.getStartPosition (), node.getEndPosition ());
            mContentHandler.characters (text.toCharArray (), 0, text.length ());
        }
        else if (node instanceof Tag)
        {
            tag = (Tag)node;
            if (mNameSpaces)
                mSupport.processName (tag.getTagName (), mParts, false);
            else
            {
                mParts[0] = "";
                mParts[1] = "";
            }
            if (mNameSpacePrefixes)
                mParts[2] = tag.getTagName ();
            else if (mNameSpaces)
                mParts[2] = "";
            else
                mParts[2] = tag.getTagName ();

            mContentHandler.startElement (
                mParts[0], // uri
                mParts[1], // local
                mParts[2], // raw
                new Attributes (tag, mSupport, mParts));
            NodeList children = tag.getChildren ();
            if (null != children)
                for (int i = 0; i < children.size (); i++)
                    doSAX (children.elementAt (i));
            end = tag.getEndTag ();
            if (null != end)
            {
                if (mNameSpaces)
                    mSupport.processName (end.getTagName (), mParts, false);
                else
                {
                    mParts[0] = "";
                    mParts[1] = "";
                }
                if (mNameSpacePrefixes)
                    mParts[2] = end.getTagName ();
                else if (mNameSpaces)
                    mParts[2] = "";
                else
                    mParts[2] = end.getTagName ();
                mContentHandler.endElement (
                    mParts[0], // uri
                    mParts[1], // local
                    mParts[2]); // raw
            }
View Full Code Here

     * @return The next node in the HTML stream, or null if there are no more nodes.
     * @exception ParserException If an unrecoverable error occurs.
     */
    public Node nextNode () throws ParserException
    {
        Tag tag;
        Scanner scanner;
        NodeList stack;
        Node ret;

        try
        {
            ret = mLexer.nextNode ();
            if (null != ret)
            {
                // kick off recursion for the top level node
                if (ret instanceof Tag)
                {
                    tag = (Tag)ret;
                    if (!tag.isEndTag ())
                    {
                        // now recurse if there is a scanner for this type of tag
                        scanner = tag.getThisScanner ();
                        if (null != scanner)
                        {
                            stack = new NodeList ();
                            ret = scanner.scan (tag, mLexer, stack);
                        }
View Full Code Here

TOP

Related Classes of org.htmlparser.Tag

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.