Package org.apache.maven.wagon.shared.http

Source Code of org.apache.maven.wagon.shared.http.HtmlFileListParser

package org.apache.maven.wagon.shared.http;

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*  http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

import org.apache.maven.wagon.TransferFailedException;
import org.codehaus.plexus.util.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;

import java.io.InputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
* Html File List Parser.
*/
public class HtmlFileListParser
{
    /**
     * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
     *
     * @param is the input stream.
     * @return the file list.
     * @throws TransferFailedException if there was a problem fetching the raw html.
     */
    public static List parseFileList( String baseurl, InputStream is )
        throws TransferFailedException
    {
        Tidy tidy = new Tidy();
        tidy.setXHTML( true );
        // Don't care about the warning messages.
        tidy.setErrout( new PrintWriter( new NullOutputStream() ) );
        // Don't care about the cleaned up version of the HTML.
        Document doc = tidy.parseDOM( is, new NullOutputStream() );

        List links = new ArrayList();
        links = findAnchorLinks( links, baseurl, doc );

        return links;
    }

    private static List findAnchorLinks( List links, String baseurl, Node node )
    {
        String basepath = baseurl;

        int colslash = basepath.indexOf( "://" );
        if ( colslash > 0 )
        {
            int pathstart = basepath.indexOf( '/', colslash + 3 );
            if ( pathstart > 0 )
            {
                // slash starts path
                // "http://localhost:10007/test/path/" = "/test/path"
                basepath = baseurl.substring( pathstart );
            }
            else
            {
                // no path means top level.
                // "http://localhost:10007" = ""
                basepath = "";
            }
        }

        if ( StringUtils.equalsIgnoreCase( "a", node.getNodeName() ) )
        {
            if ( node.hasAttributes() )
            {
                String key;
                String value;
                NamedNodeMap attributes = node.getAttributes();
                for ( int i = 0; i < attributes.getLength(); i++ )
                {
                    key = attributes.item( i ).getNodeName().toLowerCase();
                    if ( "href".equals( key ) )
                    {
                        value = attributes.item( i ).getNodeValue();
                        if ( StringUtils.isNotEmpty( value ) )
                        {
                            value = StringUtils.trim( value );
                            if ( validFilename( value ) )
                            {
                                // simple filename.
                                links.add( value );
                            }
                            else
                            {
                                // Potentially Complex Filename.

                                // Starts with full URL base "http://www.ibiblio.org/maven2/"
                                if ( value.startsWith( baseurl ) )
                                {
                                    String tst = value.substring( baseurl.length() );
                                    if ( validFilename( tst ) )
                                    {
                                        links.add( tst );
                                        continue;
                                    }
                                }

                                // Starts with host relative base url "/maven2/"
                                if ( value.startsWith( basepath ) )
                                {
                                    String tst = value.substring( basepath.length() );
                                    if ( validFilename( tst ) )
                                    {
                                        links.add( tst );
                                        continue;
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        if ( node.hasChildNodes() )
        {
            NodeList nodes = node.getChildNodes();
            for ( int nodenum = 0; nodenum < nodes.getLength(); nodenum++ )
            {
                findAnchorLinks( links, baseurl, nodes.item( nodenum ) );
            }
        }

        return links;
    }

    private static boolean validFilename( String tst )
    {
        final Pattern badFilenames = Pattern.compile( "[:?&@*]" );

        if ( badFilenames.matcher( tst ).find() )
        {
            return false;
        }

        String tstpath = StringUtils.replace( tst, '\\', '/' );
        int pathparts = StringUtils.countMatches( tstpath, "/" );

        if ( pathparts > 1 )
        {
            return false;
        }
        else if ( ( pathparts == 1 ) && ( !tstpath.endsWith( "/" ) ) )
        {
            return false;
        }

        return true;
    }
}
TOP

Related Classes of org.apache.maven.wagon.shared.http.HtmlFileListParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.