// Package info.bliki.wiki.filter
//
// Source code of info.bliki.wiki.filter.WikipediaParser

package info.bliki.wiki.filter;

import info.bliki.commons.validator.routines.EmailValidator;
import info.bliki.htmlcleaner.ContentToken;
import info.bliki.htmlcleaner.EndTagToken;
import info.bliki.htmlcleaner.TagNode;
import info.bliki.htmlcleaner.TagToken;
import info.bliki.wiki.model.Configuration;
import info.bliki.wiki.model.DefaultEventListener;
import info.bliki.wiki.model.IEventListener;
import info.bliki.wiki.model.ITableOfContent;
import info.bliki.wiki.model.IWikiModel;
import info.bliki.wiki.tags.HTMLBlockTag;
import info.bliki.wiki.tags.HTMLTag;
import info.bliki.wiki.tags.HrTag;
import info.bliki.wiki.tags.PTag;
import info.bliki.wiki.tags.WPBoldItalicTag;
import info.bliki.wiki.tags.WPPreTag;
import info.bliki.wiki.tags.WPTag;
import info.bliki.wiki.tags.util.Attribute;
import info.bliki.wiki.tags.util.IBodyTag;
import info.bliki.wiki.tags.util.INoBodyParsingTag;
import info.bliki.wiki.tags.util.NodeAttribute;
import info.bliki.wiki.tags.util.TagStack;
import info.bliki.wiki.tags.util.WikiTagNode;

import java.util.List;

/**
* A Wikipedia syntax parser for the second pass in the parsing of a Wikipedia
* source text.
*
* @see TemplateParser for the first pass
*/
public class WikipediaParser extends AbstractParser implements IParser {
  /**
   * Table of contents tag. Assigned from the wiki model when a section header
   * is appended and when <code>__TOC__</code> is recognized; also handed to
   * the table parser.
   */
  private ITableOfContent fTableOfContentTag = null;

  /**
   * Running number of section headers; pre-incremented for every header that
   * is passed to the wiki model.
   */
  private int fHeadCounter = 0;

  /**
   * Enable HTML tags
   */
  private boolean fHtmlCodes = true;

  /**
   * Passed to the wiki model with every section header.
   * NOTE(review): presumably set through <code>setNoToC()</code> when
   * <code>__NOTOC__</code> is found - the setter is not visible here.
   */
  private boolean fNoToC = false;

  /**
   * Value of the <code>renderTemplate</code> constructor argument.
   */
  private boolean fRenderTemplate = false;

  /**
   * Set to <code>true</code> when <code>__TOC__</code> or
   * <code>__FORCETOC__</code> is recognized.
   */
  private boolean fForceToC = false;

  /**
   * Listener notified about wiki links, headers and templates. The
   * constructor substitutes {@link DefaultEventListener#CONST} for
   * <code>null</code>.
   */
  private IEventListener fEventListener = null;

  /**
   * Create a parser without a custom event listener; delegates to the
   * three-argument constructor with a <code>null</code> listener.
   *
   * @param stringSource
   *          the wiki source text to parse
   * @param renderTemplate
   *          stored in <code>fRenderTemplate</code>
   */
  public WikipediaParser(String stringSource, boolean renderTemplate) {
    this(stringSource, renderTemplate, null);
  }

  public WikipediaParser(String stringSource, boolean renderTemplate, IEventListener wikiListener) {
    super(stringSource);
    fRenderTemplate = renderTemplate;
    if (wikiListener == null) {
      fEventListener = DefaultEventListener.CONST;
    } else {
      fEventListener = wikiListener;
    }
  }

  /**
   * Copy the read ahead content in the resulting HTML text token.
   *
   * @param diff
   *          subtract <code>diff</code> form the current parser position to get
   *          the HTML text token end position.
   */
  private boolean createPreContentToken(final int diff) {
    if (fWhiteStart) {
      try {
        final int count = fCurrentPosition - diff - fWhiteStartPosition;
        if (count > 0) {
          String rawWikiText = fStringSource.substring(fWhiteStartPosition, fWhiteStartPosition + count);
          WikipediaPreTagParser.parseRecursive(rawWikiText, fWikiModel);
          fWhiteStart = false;
        }
        return true;
      } catch (InvalidPreWikiTag ipwt) {
      }
    }
    return false;
  }

  /**
   * Scan the source from the current position until the next token is found.
   * <p>
   * Each character is examined in two stages: first the constructs handled
   * before the paragraph logic (templates, tables, behavior switches, headers,
   * lists, horizontal rulers, pre-formatted blocks), then - after the
   * paragraph handling for the start of a line - the inline constructs (links,
   * quote markup, HTML tags, ISBN/URI/CamelCase links). A helper that returns
   * <code>true</code> has consumed its input, so scanning continues with the
   * next character.
   *
   * @return <code>TokenBOLDITALIC</code>, <code>TokenBOLD</code> or
   *         <code>TokenITALIC</code> for quote markup, <code>TokenIgnore</code>
   *         after an HTML tag was handled, or <code>TokenEOF</code> when the
   *         end of the source is reached.
   */
  public int getNextToken() // throws InvalidInputException
  {
    fWhiteStart = true;
    fWhiteStartPosition = fCurrentPosition;
    try {
      while (true) {
        // reading past the end of fSource throws IndexOutOfBoundsException,
        // which terminates the loop (see the catch block below)
        fCurrentCharacter = fSource[fCurrentPosition++];

        // ---------Identify the next token-------------
        switch (fCurrentCharacter) {
        case '\n':
          // check at the end of line, if there is open wiki bold or italic
          // markup
          reduceTokenStackBoldItalic();
          break;
        case '{':
          // dummy parsing of wikipedia templates for event listeners
          if (parseTemplate()) {
          } else {
            // wikipedia table handling
            if (parseTable()) {
              continue;
            }
          }
          break;
        case '_': // TOC identifiers __NOTOC__, __FORCETOC__ ...
          if (parseSpecialIdentifiers()) {
            continue;
          }
          break;
        case '=': // wikipedia header ?
          if (parseSectionHeaders()) {
            continue;
          }
          break;
        case WPList.DL_DD_CHAR: // start of <dl><dd> list
        case WPList.DL_DT_CHAR: // start of <dl><dt> list
        case WPList.OL_CHAR: // start of <ol> list
        case WPList.UL_CHAR: // start of <ul> list
          if (parseLists()) {
            continue;
          }
          break;
        // case ':':
        // if (parseSimpleDefinitionLists()) {
        // continue;
        // }
        // break;
        // case ';':
        // if (parseDefinitionLists()) {
        // continue;
        // }
        // break;
        case '-': // parse ---- as <hr>
          if (parseHorizontalRuler()) {
            continue;
          }
          break;
        case ' ': // pre-formatted text?
        case '\t':
          if (parsePreformattedWikiBlock()) {
            continue;
          }
          break;
        }

        // paragraph handling - only at the start of a line and only on
        // recursion level 1
        if (isStartOfLine() && fWikiModel.getRecursionLevel() == 1) {
          if (isEmptyLine(1)) {
            if (fWikiModel.stackSize() > 0 && (fWikiModel.peekNode() instanceof PTag)) {
              // close <p> tag:
              createContentToken(2);
              fWikiModel.reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN);
            }
          } else {
            if (fWikiModel.stackSize() == 0) {
              addParagraph();
              // if (fWikiModel.getRecursionLevel() == 1) {
              // addParagraph();
              // } else {
              // if (fCurrentPosition > 1) {
              // addParagraph();
              // }
              // }
            } else {
              TagToken tag = fWikiModel.peekNode();
              if (tag instanceof WPPreTag) {
                addPreformattedText();
                // } else if (tag instanceof PTag) {
                // createContentToken(fWhiteStart, fWhiteStartPosition, 2);
                // reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN);
              } else {
                // open a paragraph only if the tag on top of the stack is
                // listed as an allowed parent of <p>
                String allowedParents = Configuration.HTML_PARAGRAPH_OPEN.getParents();
                if (allowedParents != null) {
                  int index = -1;
                  index = allowedParents.indexOf("|" + tag.getName() + "|");
                  if (index >= 0) {
                    addParagraph();
                  }
                }
              }
            }
          }
        }

        // ---------Identify the next token-------------
        switch (fCurrentCharacter) {
        case '[':
          if (parseWikiLink()) {
            continue;
          }
          break;
        case '\'':
          // count consecutive quotes: 2 => italic, 3 => bold, 5 => bold+italic
          if (getNextChar('\'')) {
            if (getNextChar('\'')) {
              if (getNextChar('\'')) {
                if (getNextChar('\'')) {
                  createContentToken(5);
                  return TokenBOLDITALIC;
                }
                // four quotes: step back by one character and report bold
                fCurrentPosition -= 1;
                fWhiteStart = true;
                createContentToken(3);
                return TokenBOLD;
              }
              createContentToken(3);
              return TokenBOLD;
            }
            createContentToken(2);
            return TokenITALIC;
          }
          break;
        case '<':
          if (fHtmlCodes) {
            // remembered so the scanner can be reset if no tag is recognized
            int htmlStartPosition = fCurrentPosition;
            // HTML tags are allowed
            try {
              switch (fStringSource.charAt(fCurrentPosition)) {
              case '!': // <!-- HTML comment -->
                if (parseHTMLCommentTags()) {
                  continue;
                }
                break;
              default:

                if (fSource[fCurrentPosition] != '/') {
                  // opening HTML tag
                  WikiTagNode tagNode = parseTag(fCurrentPosition);
                  if (tagNode != null) {
                    String tagName = tagNode.getTagName();
                    TagToken tag = fWikiModel.getTokenMap().get(tagName);
                    if (tag != null) {
                      // work on a copy of the tag registered in the token map
                      tag = (TagToken) tag.clone();

                      if (tag instanceof TagNode) {
                        TagNode node = (TagNode) tag;
                        List<NodeAttribute> attributes = tagNode.getAttributesEx();
                        Attribute attr;
                        String temp;
                        // NOTE(review): starts at index 1 - presumably entry 0
                        // holds the tag name itself; confirm against
                        // WikiTagNode.getAttributesEx()
                        for (int i = 1; i < attributes.size(); i++) {
                          attr = attributes.get(i);
                          temp = attr.getValue();
                          if (temp != null) {
                            temp = parseNowiki(temp);
                          }
                          node.addAttribute(attr.getName(), temp, true);
                        }
                      }
                      if (tag instanceof HTMLTag) {
                        ((HTMLTag) tag).setTemplate(isTemplate());
                      }

                      createContentToken(1);

                      fCurrentPosition = fScannerPosition;

                      String allowedParents = tag.getParents();
                      if (allowedParents != null) {
                        fWikiModel.reduceTokenStack(tag);
                      }
                      createTag(tag, tagNode, tagNode.getEndPosition());
                      return TokenIgnore;
                    } else {
                      // tag name is not registered in the token map: skip
                      // behind its end tag and emit the pending text
                      fWhiteStart = true;
                      skipUntilEndOfTag(tagNode, tagNode.getEndPosition());
                      createContentToken(0);
                      return TokenIgnore;
                    }
                    // break;
                  }
                } else {
                  // closing HTML tag
                  WikiTagNode tagNode = parseTag(++fCurrentPosition);
                  if (tagNode != null) {
                    String tagName = tagNode.getTagName();
                    TagToken tag = fWikiModel.getTokenMap().get(tagName);
                    if (tag != null) {
                      createContentToken(2);
                      fCurrentPosition = fScannerPosition;

                      if (fWikiModel.stackSize() > 0) {
                        TagToken topToken = fWikiModel.peekNode();
                        if (topToken.getName().equals(tag.getName())) {
                          // the end tag matches the tag on top of the stack
                          fWikiModel.popNode();
                          return TokenIgnore;
                        } else {
                          if (tag.isReduceTokenStack()) {
                            reduceStackUntilToken(tag);
                          }
                        }
                      } else {
                      }
                      return TokenIgnore;
                    }
                    break;
                  }
                }
              }
            } catch (IndexOutOfBoundsException e) {
              // do nothing
            }
            // no HTML tag recognized: reset the scanner behind the '<'
            fCurrentPosition = htmlStartPosition;
          }
          break;
        default:
          if (Character.isLetter(fCurrentCharacter)) {
            // only at the start of a word (no letter or digit directly before)
            if (fCurrentPosition < 2 || !Character.isLetterOrDigit(fSource[fCurrentPosition - 2])) {
              if (fCurrentCharacter == 'i' || fCurrentCharacter == 'I') {
                // ISBN ?
                if (parseISBNLinks()) {
                  continue;
                }
              }

              if (parseURIScheme()) {
                // a URI scheme registered in the wiki model (ftp, http,
                // https,...)
                continue;
              }

              if (fWikiModel.isCamelCaseEnabled() && Character.isUpperCase(fCurrentCharacter)
                  && fWikiModel.getRecursionLevel() <= 1) {
                if (parseCamelCaseLink()) {
                  continue;
                }
              }
            }
          }
        }

        // start a new run of plain text at the current character if no run
        // is pending
        if (!fWhiteStart) {
          fWhiteStart = true;
          fWhiteStartPosition = fCurrentPosition - 1;
        }

      }
      // -----------------end switch while try--------------------
    } catch (IndexOutOfBoundsException e) {
      // end of scanner text
    }
    try {
      // emit the remaining plain text
      createContentToken(1);
    } catch (IndexOutOfBoundsException e) {
      // end of scanner text
    }
    return TokenEOF;
  }

  /**
   * Parse nowiki tags.
   *
   * @param input
   * @return
   */
  private String parseNowiki(String input) {
    int indx = input.indexOf("<nowiki>");
    int indx2;
    int lastIndx = 0;
    if (indx >= 0) {
      StringBuilder buf = new StringBuilder(input.length());
      while (indx >= 0) {
        buf.append(input.substring(lastIndx, indx));
        lastIndx = indx + 8; // <nowiki> length
        indx2 = input.indexOf("</nowiki>", indx + 1);
        if (indx2 >= 0) {
          buf.append(input.substring(lastIndx, indx2));
          lastIndx = indx2 + 9;// </nowiki> length
        } else {
          break;
        }
        indx = input.indexOf("<nowiki>", indx2 + 1);
      }
      buf.append(input.substring(lastIndx, input.length()));
      return buf.toString();
    }
    return input;
  }

  /**
   * Start a new paragraph: emit the pending text (without the last two
   * scanned characters), reduce the token stack for an opening
   * <code>&lt;p&gt;</code> tag and push a new {@link PTag}.
   */
  private void addParagraph() {
    createContentToken(2);
    fWikiModel.reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN);
    fWikiModel.pushNode(new PTag());
  }

  /**
   * Add the content of the wiki &lt;pre&gt; block. Trim the content at the
   * right side.
   */
  private void addPreformattedText() {
    if (fWhiteStart) {
      int currentPos = fCurrentPosition;
      int whiteEndPosition = fCurrentPosition - 2;
      while (whiteEndPosition > fWhiteStartPosition) {
        if (!Character.isWhitespace(fSource[whiteEndPosition])) {
          whiteEndPosition++;
          break;
        }
        whiteEndPosition--;
      }
      try {
        fCurrentPosition = whiteEndPosition;
        createContentToken(0);
      } finally {
        fCurrentPosition = currentPos;
      }
    }
    fWikiModel.reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN);
    fWikiModel.pushNode(new PTag());
  }

  private boolean parseHTMLCommentTags() {
    int htmlStartPosition = fCurrentPosition;
    String htmlCommentString = fStringSource.substring(fCurrentPosition - 1, fCurrentPosition + 3);

    if (htmlCommentString.equals("<!--")) {
      fCurrentPosition += 3;
      if (readUntil("-->")) {
        String htmlCommentContent = fStringSource.substring(htmlStartPosition + 3, fCurrentPosition - 3);
        if (htmlCommentContent != null) {
          createContentToken(fCurrentPosition - htmlStartPosition + 1);
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Try to parse an ISBN reference: the letters <code>ISBN</code> (any case),
   * one space and a run of digits and <code>'-'</code> characters. On entry
   * <code>fCurrentCharacter</code> is the first letter and the scanner is
   * positioned directly behind it.
   *
   * @return <code>true</code> if an ISBN link was appended to the wiki model;
   *         <code>false</code> otherwise, with the scanner position restored.
   */
  private boolean parseISBNLinks() {
    final int urlStartPosition = fCurrentPosition;
    boolean foundISBN = false;
    try {
      // the pre-increments advance the scanner while matching "sbn "
      if ((fCurrentCharacter == 'i' || fCurrentCharacter == 'I')
          && (fSource[fCurrentPosition] == 's' || fSource[fCurrentPosition] == 'S')
          && (fSource[++fCurrentPosition] == 'b' || fSource[fCurrentPosition] == 'B')
          && (fSource[++fCurrentPosition] == 'n' || fSource[fCurrentPosition] == 'N') && fSource[++fCurrentPosition] == ' ') {
        fCurrentPosition++;
        // emit the pending text in front of "ISBN " (5 characters)
        createContentToken(5);
        foundISBN = true;
        char ch;
        ch = fSource[fCurrentPosition++];
        while ((ch >= '0' && ch <= '9') || ch == '-') {
          ch = fSource[fCurrentPosition++];
        }
      }
    } catch (IndexOutOfBoundsException e) {
      // end of source reached while scanning
    }
    if (foundISBN) {
      // the scanner is one position behind the character that ended the
      // number, hence the "- 1"
      String urlString = fStringSource.substring(urlStartPosition - 1, fCurrentPosition - 1);
      fCurrentPosition--;
      fWikiModel.appendISBNLink(urlString);
      return true;
    }
    // rollback work :-)
    fCurrentPosition = urlStartPosition;
    return false;
  }

  private boolean parseMailtoLinks() {
    final int urlStartPosition = fCurrentPosition;
    int tempPosition = fCurrentPosition;
    boolean foundUrl = false;
    try {
      if ((fCurrentCharacter == 'm' || fCurrentCharacter == 'M')
          && (fSource[fCurrentPosition] == 'a' || fSource[fCurrentPosition] == 'A')
          && (fSource[++fCurrentPosition] == 'i' || fSource[fCurrentPosition] == 'I')
          && (fSource[++fCurrentPosition] == 'l' || fSource[fCurrentPosition] == 'L')
          && (fSource[++fCurrentPosition] == 't' || fSource[fCurrentPosition] == 'T')
          && (fSource[++fCurrentPosition] == 'o' || fSource[fCurrentPosition] == 'O')) {
        tempPosition += 6;
        fCurrentCharacter = fSource[tempPosition++];

        foundUrl = true;
        while (!Character.isWhitespace(fSource[tempPosition++])) {
        }
      }
    } catch (IndexOutOfBoundsException e) {
    }
    if (foundUrl) {
      String urlString = fStringSource.substring(urlStartPosition - 1, tempPosition - 1);
      String email = urlString.substring(7);
      if (EmailValidator.getInstance().isValid(email)) {
        createContentToken(5);
        fWhiteStart = false;
        fCurrentPosition = tempPosition;
        fCurrentPosition--;
        fWikiModel.appendMailtoLink(urlString, urlString, true);
        return true;
      }

    }
    // rollback work :-)
    fCurrentPosition = urlStartPosition;
    return false;
  }

  /**
   * See <a href="http://en.wikipedia.org/wiki/URI_scheme">URI scheme</a>
   *
   * @return <code>true</code> if a registered URI scheme was found in the wiki
   *         models configuration..
   */
  private boolean parseURIScheme() {
    if (fCurrentCharacter == 'm' || fCurrentCharacter == 'M') {
      // mailto ?
      if (parseMailtoLinks()) {
        return true;
      }
    }
    int urlStartPosition = fCurrentPosition;
    int tempPosition = fCurrentPosition;
    String uriSchemeName = "";
    int index = -1;
    boolean foundUrl = false;
    try {
      index = indexOfUntilNoLetter(':', fCurrentPosition);
      if (index > 0) {
        uriSchemeName = fStringSource.substring(fCurrentPosition - 1, index);
        if (fWikiModel.isValidUriScheme(uriSchemeName)) {
          // found something like "ftp", "http", "https"
          tempPosition += uriSchemeName.length() + 1;
          fCurrentCharacter = fSource[tempPosition++];

          createContentToken(1);
          fWhiteStart = false;
          foundUrl = true;
          while (Encoder.isUrlIdentifierPart(fSource[tempPosition++])) {
          }

        }
      }
    } catch (IndexOutOfBoundsException e) {
    }
    if (foundUrl) {
      String restString = fStringSource.substring(urlStartPosition - 1, tempPosition - 1);
      String uriSchemeSpecificPart = fStringSource.substring(index + 1, tempPosition - 1);
      if (fWikiModel.isValidUriSchemeSpecificPart(uriSchemeName, uriSchemeSpecificPart)) {
        fWhiteStart = false;
        fCurrentPosition = tempPosition;
        fCurrentPosition--;
        fWikiModel.appendExternalLink(uriSchemeName, restString, restString, true);
        return true;
      }

    }
    // rollback work :-)
    fCurrentPosition = urlStartPosition;
    return false;
  }

  private boolean parseCamelCaseLink() {
    int startLinkPosition = fCurrentPosition - 1;
    int temp = fCurrentPosition;
    boolean isCamelCase = false;
    try {
      char ch = fSource[temp++];
      while (Character.isLetterOrDigit(ch)) {
        if (Character.isUpperCase(ch)) {
          // at least 2 upper case characters appear in the word
          isCamelCase = true;
        }
        ch = fSource[temp++];
      }
    } catch (IndexOutOfBoundsException iobe) {
    }

    if (isCamelCase) {
      createContentToken(1);
      fWhiteStart = false;
      fCurrentPosition = temp - 1;

      String name = fStringSource.substring(startLinkPosition, fCurrentPosition);
      fWikiModel.appendInternalLink(name, null, name, null, false);
      return true;
    }

    return false;
  }

  /**
   * Parse a wiki section starting with a '[' character
   *
   * @return <code>true</code> if a correct link was found
   */
  private boolean parseWikiLink() {
    int startLinkPosition = fCurrentPosition;
    if (getNextChar('[')) {
      return parseWikiTag();
    } else if (getNextCharAsWhitespace()) {
      fCurrentPosition--;
      return false;
    } else {
      createContentToken(1);
      fWhiteStart = false;

      if (readUntilCharOrStopAtEOL(']')) {
        String name = fStringSource.substring(startLinkPosition, fCurrentPosition - 1);

        // bbcode start
        if (fWikiModel.parseBBCodes() && name.length() > 0) {
          // parse start tokens like phpBB forum syntax style (bbcode)
          char ch = name.charAt(0);
          if ('a' <= ch && ch <= 'z') {
            // first character must be a letter
            StringBuilder bbCode = new StringBuilder(name.length());
            bbCode.append(ch);
            if (parsePHPBBCode(name, bbCode)) {
              return true;
            }
          }
        }
        // bbcode end

        if (handleHTTPLink(name)) {
          return true;
        }
      }
      fCurrentPosition = startLinkPosition;
    }
    return false;
  }

  /**
   * Parse a wiki section starting with a '[[' sequence
   *
   * @return <code>true</code> if a correct link was found
   */
  private boolean parseWikiTag() {
    int startLinkPosition = fCurrentPosition;
    int endLinkPosition;
    // wikipedia link style
    createContentToken(2);

    int temp = fCurrentPosition;
    if (findWikiLinkEnd()) {
      endLinkPosition = fCurrentPosition - 2;
      String name = fStringSource.substring(startLinkPosition, endLinkPosition);
      // test for a suffix string behind the Wiki link. Useful for plurals.
      // Example:
      // Dolphins are [[aquatic mammal]]s that are closely related to [[whale]]s
      // and [[porpoise]]s.
      temp = fCurrentPosition;
      String suffix = "";
      try {
        fCurrentCharacter = fSource[fCurrentPosition];
        if (Character.isLowerCase(fCurrentCharacter)) {
          fCurrentPosition++;
          StringBuilder suffixBuffer = new StringBuilder(16);
          suffixBuffer.append(fCurrentCharacter);
          while (true) {
            fCurrentCharacter = fSource[fCurrentPosition++];
            if (!Character.isLowerCase(fCurrentCharacter)) {
              fCurrentPosition--;
              break;
            }
            suffixBuffer.append(fCurrentCharacter);
          }
          suffix = suffixBuffer.toString();
        }
      } catch (IndexOutOfBoundsException e) {
        fCurrentPosition = temp;
      }
      fEventListener.onWikiLink(fSource, startLinkPosition, endLinkPosition, suffix);
      if (!fWikiModel.appendRawWikipediaLink(name, suffix)) {
        fCurrentPosition = temp;
      }
      return true;
    } else {
      fWhiteStart = true;
      fWhiteStartPosition = startLinkPosition - 2;
      fCurrentPosition = temp + 1;
    }
    return false;
  }

  /**
   * Try to parse a wiki pre-formatted block, i.e. consecutive lines that start
   * with a space or tab character. Only attempted at the start of a non-empty
   * line. The lines are handed to {@link #createPreContentToken(int)} while a
   * {@link WPPreTag} is on the node stack; the tag is popped again before the
   * method returns.
   *
   * @return <code>false</code> if the scanner is not at the start of a
   *         non-empty line or the block content was rejected;
   *         <code>true</code> otherwise.
   */
  private boolean parsePreformattedWikiBlock() {
    if (isStartOfLine() && !isEmptyLine(1)) {
      if (fWikiModel.stackSize() == 0 || !(fWikiModel.peekNode() instanceof HTMLBlockTag)
          || (fWikiModel.peekNode() instanceof PTag)) {
        createContentToken(2);
        fWikiModel.reduceTokenStack(Configuration.HTML_PRE_OPEN);

        // don't use Configuration.HTML_PRE_OPEN here
        // rendering differs between these tags!
        fWikiModel.pushNode(new WPPreTag());

        char ch = ' ';
        try {
          // one iteration per line of the block
          while (ch == ' ' || ch == '\t') {
            // SPACE or TAB => check if it's a pre-formatted text
            fWhiteStart = true;
            fWhiteStartPosition = fCurrentPosition;
            ch = fSource[fCurrentPosition++];
            // read up to the end of the line
            while (ch != '\n' && fCurrentPosition < fSource.length) {
              ch = fSource[fCurrentPosition++];
            }
            if (fCurrentPosition == fSource.length) {
              // scanner reached end of text
              if (!createPreContentToken(0)) {
                // content rejected: reset the scanner to the start of this
                // line and overwrite the character in front of it with '\n'
                fCurrentPosition = fWhiteStartPosition;
                fSource[fWhiteStartPosition - 1] = '\n';
                return false;
              }
            } else {
              // look at the first character of the following line
              ch = fSource[fCurrentPosition++];
              if (ch == ' ' || ch == '\t') {
                if (!createPreContentToken(1)) {
                  fCurrentPosition = fWhiteStartPosition;
                  fSource[fWhiteStartPosition - 1] = '\n';
                  return false;
                }
              } else {
                // skip the newline character at the end of the pre-formatted
                // block
                if (!createPreContentToken(2)) {
                  fCurrentPosition = fWhiteStartPosition;
                  fSource[fWhiteStartPosition - 1] = '\n';
                  return false;
                } else {
                  fCurrentPosition--;
                  return true;
                }
              }
            }

          }
        } catch (IndexOutOfBoundsException e) {
          fCurrentPosition--;
        } finally {
          // runs on every exit path of the try block, including the returns
          fWikiModel.popNode();
        }

      }
      return true;
    }
    return false;
  }

  /**
   * Parse <code>----</code> as &lt;hr&gt; tag
   *
   * @return
   */
  private boolean parseHorizontalRuler() {
    if (isStartOfLine()) {
      int tempCurrPosition = fCurrentPosition;
      try {
        if (fSource[tempCurrPosition++] == '-' && fSource[tempCurrPosition++] == '-' && fSource[tempCurrPosition++] == '-') {
          int pos = isEndOfLine('-', tempCurrPosition);
          if (pos > 0) {
            HrTag hr = new HrTag();
            createContentToken(2);
            fWikiModel.reduceTokenStack(hr);
            fCurrentPosition = pos;
            fWikiModel.append(hr);
            fWhiteStart = false;
            return true;
          }
        }
      } catch (IndexOutOfBoundsException e) {

      }
      fCurrentPosition = tempCurrPosition;
    }
    return false;
  }

  /**
   * Parse a wiki list <br/>
   * <br/>
   * Example:<br/>
   *
   * <pre>
   * * first line
   * * second line
   * ** third line
   * </pre>
   *
   * @return
   */
  private boolean parseLists() {
    // set scanner pointer to '\n' character:
    if (isStartOfLine()) {
      setPosition(fCurrentPosition - 2);
      WPList list = wpList();
      if (list != null && !list.isEmpty()) {
        createContentToken(1);
        fWikiModel.reduceTokenStack(list);
        fCurrentPosition = getPosition() - 1;
        fWikiModel.append(list);
        return true;
      }
    }
    return false;
  }

  /**
   * Parses a wiki header line into &quot;h1, h2, h3, h4, h5, h6&quot; HTML
   * tags. <br/>
   * <br/>
   * Example wiki syntax header line: <br/>
   * <code>== Test header 2 ==</code>
   *
   * @return <code>true</code> if a header line could be parsed correctly,
   *         <code>false</code> otherwise.
   */
  private boolean parseSectionHeaders() {
    if (isStartOfLine()) {
      int headerStartPosition = fCurrentPosition - 1;
      int endIndex = fStringSource.indexOf("\n", fCurrentPosition);
      if (endIndex < 0) {
        endIndex = fStringSource.length();
      }
      int headerEndPosition = endIndex;
      char ch;
      while (headerEndPosition > 0) {
        ch = fSource[--headerEndPosition];
        if (!Character.isWhitespace(ch)) {
          break;
        }
      }
      if (headerEndPosition < 0 || headerEndPosition <= headerStartPosition) {
        return false;
      }
      int level = 0;
      int startPosition = headerStartPosition;
      int endPosition = headerEndPosition + 1;
      while (headerStartPosition < headerEndPosition) {
        if (fSource[headerStartPosition] == '=' && fSource[headerEndPosition] == '=') {
          level++;
          headerStartPosition++;
          headerEndPosition--;
        } else {
          headerEndPosition++;
          break;
        }
      }
      if (level == 0) {
        return false;
      }
      if (level > 6) {
        level = 6;
      }
      createContentToken(1);
      reduceTokenStack();
      String head = "";
      if (headerEndPosition >= headerStartPosition) {
        if (headerEndPosition > headerStartPosition) {
          head = fStringSource.substring(headerStartPosition, headerEndPosition);
        } else {
          head = String.valueOf(fStringSource.charAt(headerStartPosition));
        }
      }
      fEventListener.onHeader(fSource, startPosition, endPosition, headerStartPosition, headerEndPosition, level);
      fCurrentPosition = endIndex;

      if (head != null) {
        fTableOfContentTag = fWikiModel.appendHead(head, level, fNoToC, ++fHeadCounter, startPosition, endPosition);
      }
      return true;
    }
    return false;
  }

  private boolean parseTable() {
    if (isStartOfLine()) {
      // wiki table ?
      setPosition(fCurrentPosition - 1);
      WPTable table = wpTable(fTableOfContentTag);
      if (table != null) {
        createContentToken(1);
        fWikiModel.reduceTokenStack(table);
        // set pointer behind: "\n|}"
        fCurrentPosition = getPosition();
        fWikiModel.append(table);
        // table.filter(fSource, fWikiModel);
        return true;
      }
    }
    return false;
  }

  private boolean parseTemplate() {
    // dummy parsing of Wikipedia templates for event listeners
    // doesn't change fCurrentPosition
    if (fSource[fCurrentPosition] == '{') {
      int templateStartPosition = fCurrentPosition + 1;
      if (fSource[templateStartPosition] != '{') {
        int templateEndPosition = findNestedTemplateEnd(fSource, templateStartPosition);
        if (templateEndPosition > 0) {
          fEventListener.onTemplate(fSource, templateStartPosition, templateEndPosition - 2);
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Parse special identifiers like __TOC__, __NOTOC__, __FORCETOC__
   *
   * @return
   */
  private boolean parseSpecialIdentifiers() {
    if (fSource[fCurrentPosition] == '_') {
      fCurrentPosition++;
      int tocEndPosition = fCurrentPosition;
      char ch;
      while (true) {
        ch = fSource[tocEndPosition++];
        if (ch >= 'A' && ch <= 'Z') {
          continue;
        }
        break;
      }
      if (ch == '_' && fSource[tocEndPosition] == '_') {
        String tocIdent = fStringSource.substring(fCurrentPosition, tocEndPosition - 1);
        if (fWikiModel.parseBehaviorSwitch(tocIdent)) {
          createContentToken(2);
          fCurrentPosition = tocEndPosition + 1;
          return true;
        }
        boolean tocRecognized = false;
        for (int i = 0; i < TOC_IDENTIFIERS.length; i++) {
          if (TOC_IDENTIFIERS[i].equals(tocIdent)) {
            createContentToken(2);
            tocRecognized = true;
            fCurrentPosition = tocEndPosition + 1;
            switch (i) {
            case 0: // TOC
              fTableOfContentTag = fWikiModel.createTableOfContent(true);
              fForceToC = true;
              break;
            case 1: // NOTOC
              setNoToC(true);
              break;
            case 2: // FORCETOC
              fForceToC = true;
              break;
            }
            break;
          }
        }
        if (tocRecognized) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Check if the scanners cursor position is at the beginning of a line.
   *
   * @return <code>true</code> if the scanners cursor points to the beginning of
   *         a line, <code>false</code> otherwise.
   */
  private boolean isStartOfLine() {
    if (fCurrentPosition >= 2) {
      if (fSource[fCurrentPosition - 2] == '\n') {
        return true;
      }
    } else if (fCurrentPosition == 1) {
      return true;
    }
    return false;
  }

  private int isEndOfLine(char testChar, int currentPosition) {
    int tempPosition = currentPosition;
    try {
      char ch;
      while (true) {
        ch = fSource[tempPosition];
        if (ch != testChar) {
          break;
        }
        tempPosition++;
      }
      while (true) {
        ch = fSource[tempPosition++];
        if (ch == '\n') {
          return tempPosition;
        } else if (!Character.isWhitespace(ch)) {
          return -1;
        }
      }
    } catch (IndexOutOfBoundsException e) {

    }
    return -1;
  }

  private void createTag(TagToken tag, WikiTagNode tagNode, int startMacroPosition) {
    String endTag;
    String macroBodyString = "";
    int index0;
    String command = tagNode.getTagName();
    if ((tag != null) && (tag instanceof IBodyTag) && (!tagNode.isEmptyXmlTag())) {
      endTag = command + '>';
      index0 = Util.indexOfIgnoreCase(fStringSource, "</", endTag, startMacroPosition);

      if (index0 >= 0) {
        macroBodyString = fStringSource.substring(startMacroPosition, index0);
        fCurrentPosition = index0 + endTag.length() + 2;
      } else {
        macroBodyString = fStringSource.substring(startMacroPosition, fSource.length);
        fCurrentPosition = fSource.length;
      }
    } else {
      macroBodyString = null;
      fCurrentPosition = startMacroPosition;
    }

    handleTag(tag, tagNode, macroBodyString);
  }

  private void skipUntilEndOfTag(WikiTagNode tagNode, int startMacroPosition) {
    String endTag;
    int index0;
    String command = tagNode.getTagName();
    if (!tagNode.isEmptyXmlTag()) {
      endTag = command + '>';
      index0 = Util.indexOfIgnoreCase(fStringSource, "</", endTag, startMacroPosition);
      if (index0 >= 0) {
        fCurrentPosition = index0 + endTag.length() + 2;
      } else {
        fCurrentPosition = fSource.length;
      }
    }
  }

  private boolean handleHTTPLink(String name) {
    String urlString;
    String uriSchemeName = "";
    if (name != null) {
      boolean isEmail = false;

      int index = -1;
      boolean foundUrl = false;
      boolean protocolRelativeURL = false;

      urlString = name.trim();
      if (urlString.length() >= 2 && urlString.charAt(0) == '/' && urlString.charAt(1) == '/') {
        // issue 89
        foundUrl = true;
        protocolRelativeURL = true;
      } else {

        try {
          index = urlString.indexOf(':', 1);
          if (index > 0) {
            uriSchemeName = urlString.substring(0, index);
            if (uriSchemeName.equalsIgnoreCase("mailto")) {
              isEmail = true;
              foundUrl = true;
            } else {
              if (fWikiModel.isValidUriScheme(uriSchemeName)) {
                foundUrl = true;
              }
            }
          }
        } catch (IndexOutOfBoundsException e) {
        }
      }

      if (foundUrl) {
        // Wikipedia link style: name separated by space?
        int pipeIndex = urlString.indexOf(' ');
        String alias = "";
        if (pipeIndex != (-1)) {
          alias = urlString.substring(pipeIndex + 1);
          urlString = urlString.substring(0, pipeIndex);
        } else {
          if (protocolRelativeURL) {
            alias = urlString.substring(2);
          } else {
            alias = urlString;
          }
        }

        if (isEmail) {
          String email;
          if (pipeIndex > 7) {
            email = urlString.substring(7, pipeIndex);
          } else {
            email = urlString.substring(7);
          }
          if (EmailValidator.getInstance().isValid(email)) {
            fWikiModel.appendMailtoLink(urlString, alias, false);
            return true;
          }
        } else {
          if (protocolRelativeURL) {
            fWikiModel.appendExternalLink(uriSchemeName, urlString, alias, false);
            return true;
          }
          parseURIScheme();
          String uriSchemeSpecificPart = urlString.substring(index + 1);
          if (fWikiModel.isValidUriSchemeSpecificPart(uriSchemeName, uriSchemeSpecificPart)) {
            fWikiModel.appendExternalLink(uriSchemeName, urlString, alias, false);
            return true;
          }
        }

      }
    }
    return false;
  }

  private void handleTag(TagToken tag, WikiTagNode tagNode, String bodyString) {
    String command = tagNode.getTagName();
    try {
      if (tag instanceof EndTagToken) {
        fWikiModel.append(tag);
      } else {
        fWikiModel.pushNode(tag);
        if (null != bodyString) {
          if (tag instanceof INoBodyParsingTag) {
            ((TagNode) tag).addChild(new ContentToken(bodyString));
          } else {
            // recursively filter tags within the tags body string
            WikipediaParser.parseRecursive(bodyString.trim(), fWikiModel, false, true);
          }
        }
        if (tag instanceof IBodyTag) {
          fWikiModel.popNode();
        }
      }
    } catch (IllegalArgumentException e) {
      TagNode divTagNode = new TagNode("div");
      divTagNode.addAttribute("class", "error", true);
      divTagNode.addChild(new ContentToken("IllegalArgumentException: " + command + " - " + e.getMessage()));
      fWikiModel.append(divTagNode);
      e.printStackTrace();
    } catch (Throwable e) {
      e.printStackTrace();
      TagNode divTagNode = new TagNode("div");
      divTagNode.addAttribute("class", "error", true);
      divTagNode.addChild(new ContentToken(command + ": " + e.getMessage()));
      fWikiModel.append(divTagNode);
      e.printStackTrace();
    }
  }

  @Override
  public void runParser() {
    int token = TokenSTART;
    while ((token = getNextToken()) != TokenEOF) {
      switch (token) {
      case TokenBOLDITALIC:
        if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) {
          fWikiModel.popNode();
        } else if (fWikiModel.stackSize() > 1 && fWikiModel.peekNode().equals(BOLD)
            && fWikiModel.getNode(fWikiModel.stackSize() - 2).equals(ITALIC)) {
          fWikiModel.popNode();
          fWikiModel.popNode();
        } else if (fWikiModel.stackSize() > 1 && fWikiModel.peekNode().equals(ITALIC)
            && fWikiModel.getNode(fWikiModel.stackSize() - 2).equals(BOLD)) {
          fWikiModel.popNode();
          fWikiModel.popNode();
        } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLD)) {
          fWikiModel.popNode();
          fWikiModel.pushNode(new WPTag("i"));
        } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(ITALIC)) {
          fWikiModel.popNode();
          fWikiModel.pushNode(new WPTag("b"));
        } else {
          fWikiModel.pushNode(new WPBoldItalicTag());
        }
        break;
      case TokenBOLD:
        if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) {
          fWikiModel.popNode();
          fWikiModel.pushNode(new WPTag("i"));
          // fResultBuffer.append("</b>");
        } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLD)) {
          fWikiModel.popNode();
        } else {
          fWikiModel.pushNode(new WPTag("b"));
        }
        break;
      case TokenITALIC:
        if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) {
          fWikiModel.popNode();
          fWikiModel.pushNode(new WPTag("b"));
        } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(ITALIC)) {
          fWikiModel.popNode();
        } else {
          fWikiModel.pushNode(new WPTag("i"));
        }
        break;
      }
    }
    reduceTokenStack();

    if (!fNoToC && fTableOfContentTag != null) {
      if (fHeadCounter > 3 || fForceToC) {
        fTableOfContentTag.setShowToC(true);
      }
    }

  }

  public boolean isNoToC() {
    return fNoToC;
  }

  @Override
  public void setNoToC(boolean noToC) {
    fNoToC = noToC;
  }

  /**
   * Call the parser on the first recursion level, where the text can contain a
   * table of contents (TOC).
   *
   * <br/>
   * <br/>
   * <b>Note:</b> in this level the wiki model will call the
   * <code>setUp()</code> method before parsing and the <code>tearDown()</code>
   * method after the parser has finished.
   *
   * @param rawWikitext
   *          the raw text of the article
   * @param wikiModel
   *          a suitable wiki model for the given wiki article text
   * @param parseTemplates
   *          parse the template expansion step
   * @param templateParserBuffer
   *          if the <code>templateParserBuffer != null</code> the
   *          <code>templateParserBuffer</code> will be used to append the
   *          result of the template expansion step
   *
   */
  public static void parse(String rawWikiText, IWikiModel wikiModel, boolean parseTemplates, Appendable templateParserBuffer) {
    try {
      // initialize the wiki model
      wikiModel.setUp();

      if (parseTemplates) {
        Appendable buf;
        if (templateParserBuffer != null) {
          buf = templateParserBuffer;
        } else {
          buf = new StringBuilder(rawWikiText.length() + rawWikiText.length() / 10);
        }
        String pass1Text = null;
        try {
          TemplateParser.parse(rawWikiText, wikiModel, buf, wikiModel.isTemplateTopic());
          pass1Text = buf.toString();
        } catch (Exception ioe) {
          ioe.printStackTrace();
          pass1Text = "<span class=\"error\">TemplateParser exception: " + ioe.getClass().getSimpleName() + "</span>";
        }
        String redirectedLink = AbstractParser.parseRedirect(pass1Text, wikiModel);
        if (redirectedLink == null) {
          parseRecursive(pass1Text, wikiModel, false, false);
        }
      } else {
        if (AbstractParser.parseRedirect(rawWikiText, wikiModel) == null) {
          parseRecursive(rawWikiText, wikiModel, false, false);
        }
      }
    } finally {
      // clean up wiki model if necessary
      wikiModel.tearDown();
    }
  }

  /**
   * Call the parser on the subsequent recursion levels, where the subtexts (of
   * templates, table cells, list items or image captions) don't contain a table
   * of contents (TOC)
   *
   * <b>Note:</b> the wiki model doesn't call the <code>setUp()</code> or
   * <code>tearDown()</code> methods for the subsequent recursive parser steps.
   *
   * @param rawWikitext
   * @param wikiModel
   * @return
   */
  public static void parseRecursive(String rawWikitext, IWikiModel wikiModel) {
    parseRecursive(rawWikitext, wikiModel, false, true);
  }

  /**
   * Call the parser on the subsequent recursion levels, where the subtexts (of
   * templates, table cells, list items or image captions) don't contain a table
   * of contents (TOC)
   *
   * <b>Note:</b> the wiki model doesn't call the <code>setUp()</code> or
   * <code>tearDown()</code> methods for the subsequent recursive parser steps.
   *
   * @param rawWikitext
   * @param wikiModel
   * @param noTOC
   * @param appendStack
   * @return
   * @return
   */
  public static TagStack parseRecursive(String rawWikitext, IWikiModel wikiModel, boolean createOnlyLocalStack, boolean noTOC) {
    AbstractParser parser = wikiModel.createNewInstance(rawWikitext);
    return parser.parseRecursiveInternal(wikiModel, createOnlyLocalStack, noTOC);
  }

  /**
   * Determine if the currently parsed wiki text is a template text.
   *
   * @return <code>true</code> if the currently parsed wiki text is a template
   */
  public boolean isTemplate() {
    return fRenderTemplate;
  }

}
TOP

Related Classes of info.bliki.wiki.filter.WikipediaParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.