diff options
Diffstat (limited to 'pki/base/silent/src/http/HTMLDocument.java')
-rw-r--r-- | pki/base/silent/src/http/HTMLDocument.java | 747 |
1 files changed, 747 insertions, 0 deletions
diff --git a/pki/base/silent/src/http/HTMLDocument.java b/pki/base/silent/src/http/HTMLDocument.java new file mode 100644 index 000000000..5fcb5343b --- /dev/null +++ b/pki/base/silent/src/http/HTMLDocument.java @@ -0,0 +1,747 @@ +// --- BEGIN COPYRIGHT BLOCK --- +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; version 2 of the License. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +// +// (C) 2007 Red Hat, Inc. +// All rights reserved. +// --- END COPYRIGHT BLOCK --- + +import java.io.*; +import java.net.*; +import java.util.*; +import java.util.regex.*; + + + +public class HTMLDocument +{ + // Indicates whether this HTML document has been parsed. + boolean parsed; + + // A list of URLs of files that should be retrieved along with the main + // contents of the document. This may include any images contained in the + // document, and possibly any external stylesheets. + LinkedHashSet associatedFiles; + + // A list of URLs of frames that are contained in the document. + LinkedHashSet documentFrames; + + // A list of URLs of links that are contained in the document. + LinkedHashSet documentLinks; + + // A list of URLs of images that are contained in the document. + LinkedHashSet documentImages; + + // A regular expression pattern that can be used to extract a URI from an HREF + // tag. + Pattern hrefPattern; + + // A regular expression pattern that can be used to extract a URI from a SRC + // tag. + Pattern srcPattern; + + // The base URL for relative links in this document. + String baseURL; + + // The URL that may be used to access this document. + String documentURL; + + // The actual contents of the page. + String htmlData; + + // The contents of the page converted to lowercase for easier matching. + String lowerData; + + // The URL for this document with only protocol, host, and port (i.e., no + // file). + String protocolHostPort; + + // A string buffer containing the contents of the page with tags removed. + StringBuffer textData; + + + // A set of private variables used for internal processing. + private boolean lastElementIsAssociatedFile; + private boolean lastElementIsChunk; + private boolean lastElementIsComment; + private boolean lastElementIsFrame; + private boolean lastElementIsImage; + private boolean lastElementIsLink; + private boolean lastElementIsText; + private int lastElementEndPos; + private int lastElementStartPos; + private String lastURL; + + // constructor that helps to parse without url stuff + public HTMLDocument(String htmlData) + { + this.documentURL = null; + this.htmlData = htmlData; + lowerData = htmlData.toLowerCase(); + associatedFiles = null; + documentLinks = null; + documentImages = null; + textData = null; + parsed = false; + + + // Create the regex patterns that we will use for extracting URIs from tags. + hrefPattern = Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+" + + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL); + srcPattern = Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+" + + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL); + } + + + /** + * Creates a new HTML document using the provided data. + * + * @param documentURL The URL for this document. + * @param htmlData The actual data contained in the HTML document. + */ + public HTMLDocument(String documentURL, String htmlData) + throws MalformedURLException + { + this.documentURL = documentURL; + this.htmlData = htmlData; + lowerData = htmlData.toLowerCase(); + associatedFiles = null; + documentLinks = null; + documentImages = null; + textData = null; + parsed = false; + + + // Create the regex patterns that we will use for extracting URIs from tags. + hrefPattern = Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+" + + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL); + srcPattern = Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+" + + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL); + + URL url = new URL(documentURL); + String urlPath = url.getPath(); + if ((urlPath == null) || (urlPath.length() == 0)) + { + baseURL = documentURL; + protocolHostPort = documentURL; + } + else if (urlPath.equals("/")) + { + baseURL = documentURL; + protocolHostPort = documentURL.substring(0, documentURL.length()-1); + } + else if (urlPath.endsWith("/")) + { + baseURL = documentURL; + + int port = url.getPort(); + if (port > 0) + { + protocolHostPort = url.getProtocol() + "://" + url.getHost() + ":" + + port; + } + else + { + protocolHostPort = url.getProtocol() + "://" + url.getHost(); + } + } + else + { + int port = url.getPort(); + if (port > 0) + { + protocolHostPort = url.getProtocol() + "://" + url.getHost() + ":" + + port; + } + else + { + protocolHostPort = url.getProtocol() + "://" + url.getHost(); + } + + File urlFile = new File(urlPath); + String parentDirectory = urlFile.getParent(); + if ((parentDirectory == null) || (parentDirectory.length() == 0)) + { + parentDirectory = "/"; + } + else if (! parentDirectory.startsWith("/")) + { + parentDirectory = "/" + parentDirectory; + } + + baseURL = protocolHostPort + parentDirectory; + } + + if (! baseURL.endsWith("/")) + { + baseURL = baseURL + "/"; + } + } + + + + /** + * Actually parses the HTML document and extracts useful elements from it. + * + * @return <CODE>true</CODE> if the page could be parsed successfully, or + * <CODE>false</CODE> if not. + */ + public boolean parse() + { + if (parsed) + { + return true; + } + + + try + { + associatedFiles = new LinkedHashSet(); + documentFrames = new LinkedHashSet(); + documentLinks = new LinkedHashSet(); + documentImages = new LinkedHashSet(); + textData = new StringBuffer(); + + lastElementStartPos = 0; + lastElementEndPos = -1; + String element; + while ((element = nextDocumentElement()) != null) + { + if (element.length() == 0) + { + continue; + } + + if (lastElementIsText) + { + char lastChar; + if (textData.length() == 0) + { + lastChar = ' '; + } + else + { + lastChar = textData.charAt(textData.length()-1); + } + char firstChar = element.charAt(0); + if (! ((lastChar == ' ') || (lastChar == '\t') || + (lastChar == '\r') || (lastChar == '\n')) || + (firstChar == ' ') || (firstChar == '\t') || + (firstChar == '\r') || (firstChar == '\n')) + { + textData.append(" "); + } + + textData.append(element); + } + else if (lastElementIsImage) + { + if (lastURL != null) + { + documentImages.add(lastURL); + associatedFiles.add(lastURL); + } + } + else if (lastElementIsFrame) + { + if (lastURL != null) + { + documentFrames.add(lastURL); + associatedFiles.add(lastURL); + } + } + else if (lastElementIsLink) + { + if (lastURL != null) + { + documentLinks.add(lastURL); + } + } + else if (lastElementIsAssociatedFile) + { + if (lastURL != null) + { + associatedFiles.add(lastURL); + } + } + else if (lastElementIsChunk || lastElementIsComment) + { + // Don't need to do anything with this. + } + else + { + // Also don't need anything here. + } + } + + parsed = true; + } + catch (Exception e) + { + associatedFiles = null; + documentLinks = null; + documentImages = null; + textData = null; + parsed = false; + } + + return parsed; + } + + + + /** + * Retrieves the next element from the HTML document. An HTML element can + * include a string of plain text, a single HTML tag, or a larger chunk of + * HTML including a start and end tag, all of which should be considered a + * single element. + */ + private String nextDocumentElement() + { + // If we're at the end of the HTML, then return null. + if (lastElementEndPos >= htmlData.length()) + { + return null; + } + + + // Initialize the variables we will use for the search. + lastElementStartPos = lastElementEndPos+1; + lastElementIsAssociatedFile = false; + lastElementIsChunk = false; + lastElementIsComment = false; + lastElementIsFrame = false; + lastElementIsImage = false; + lastElementIsLink = false; + lastElementIsText = false; + lastURL = null; + + + // Find the location of the next open angle bracket. If there is none, then + // the rest of the document must be plain text. + int openPos = lowerData.indexOf('<', lastElementStartPos); + if (openPos < 0) + { + lastElementEndPos = htmlData.length(); + lastElementIsText = true; + return htmlData.substring(lastElementStartPos); + } + + + // If the location of the next open tag is not we started looking, then read + // everything up to that tag as text. + if (openPos > lastElementStartPos) + { + lastElementEndPos = openPos-1; + lastElementIsText = true; + return htmlData.substring(lastElementStartPos, openPos); + } + + + // The start position is an open tag. See if the tag is actually "<!--", + // which indicates an HTML comment. If that's the case, then find the + // closing "-->". + if (openPos == lowerData.indexOf("<!--", lastElementStartPos)) + { + int closePos = lowerData.indexOf("-->", openPos+1); + if (closePos < 0) + { + // This looks like an unterminated comment. We can't do much else + // here, so just stop parsing. + return null; + } + else + { + lastElementEndPos = closePos + 2; + lastElementIsComment = true; + return htmlData.substring(lastElementStartPos, lastElementEndPos+1); + } + } + + + // Find the location of the next close angle bracket. If there is none, + // then we have an unmatched open tag. What to do here? I guess just treat + // the rest of the document as text. + int closePos = lowerData.indexOf('>', openPos+1); + if (closePos < 0) + { + lastElementEndPos = htmlData.length(); + lastElementIsText = true; + return htmlData.substring(lastElementStartPos); + } + + + // Grab the contents of the tag in both normal and lowercase. + String tag = htmlData.substring(openPos, closePos+1); + String strippedTag = htmlData.substring(openPos+1, closePos).trim(); + StringTokenizer tokenizer = new StringTokenizer(strippedTag, " \t\r\n=\"'"); + lastElementEndPos = closePos; + + if (! tokenizer.hasMoreTokens()) + { + return tag; + } + + String token = tokenizer.nextToken(); + String lowerToken = token.toLowerCase(); + + if (lowerToken.equals("a") || lowerToken.equals("area")) + { + while (tokenizer.hasMoreTokens()) + { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("href")) + { + try + { + Matcher matcher = hrefPattern.matcher(tag); + lastURL = uriToURL(matcher.replaceAll("$1")); + if (lastURL != null) + { + lastElementIsLink = true; + } + } catch (Exception e) {} + break; + } + } + } + else if (lowerToken.equals("base")) + { + while (tokenizer.hasMoreTokens()) + { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("href")) + { + try + { + Matcher matcher = hrefPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + if (! uri.endsWith("/")) + { + uri = uri + "/"; + } + + baseURL = uri; + } catch (Exception e) {} + break; + } + } + } + else if (lowerToken.equals("frame") || lowerToken.equals("iframe") || + lowerToken.equals("input")) + { + while (tokenizer.hasMoreTokens()) + { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("src")) + { + try + { + Matcher matcher = srcPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + lastURL = uriToURL(uri); + if (lastURL != null) + { + lastElementIsFrame = true; + lastElementIsAssociatedFile = true; + } + } catch (Exception e) {} + break; + } + } + } + else if (lowerToken.equals("img")) + { + while (tokenizer.hasMoreTokens()) + { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("src")) + { + try + { + Matcher matcher = srcPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + lastURL = uriToURL(uri); + if (lastURL != null) + { + lastElementIsImage = true; + } + } catch (Exception e) {} + break; + } + } + } + else if (lowerToken.equals("link")) + { + boolean isStyleSheet = false; + + while (tokenizer.hasMoreTokens()) + { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("href")) + { + try + { + Matcher matcher = hrefPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + lastURL = uriToURL(uri); + if (lastURL != null) + { + lastElementIsLink = true; + } + } catch (Exception e) {} + break; + } + else if (token.equalsIgnoreCase("rel")) + { + if (tokenizer.hasMoreTokens()) + { + String relType = tokenizer.nextToken(); + if (relType.equalsIgnoreCase("stylesheet")) + { + isStyleSheet = true; + } + } + } + } + + if (lastURL != null) + { + if (isStyleSheet) + { + lastElementIsAssociatedFile = true; + } + else + { + lastElementIsLink = true; + } + } + } + else if (lowerToken.equals("script")) + { + while (tokenizer.hasMoreTokens()) + { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("src")) + { + try + { + Matcher matcher = srcPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + lastURL = uriToURL(uri); + } catch (Exception e) {} + break; + } + } + + if (lastURL == null) + { + int endScriptPos = lowerData.indexOf("</script>", lastElementEndPos+1); + if (endScriptPos > 0) + { + lastElementEndPos = endScriptPos + 8; + tag = htmlData.substring(lastElementStartPos, lastElementEndPos+1); + lastElementIsChunk = true; + } + } + else + { + lastElementIsAssociatedFile = true; + } + } + + return tag; + } + + + + /** + * Converts the provided URI to a URL. The provided URI may be a URL already, + * or it may also be an absolute path on the server or a path relative to the + * base URL. + * + * @param uri The URI to convert to a URL. + * + * @return The URL based on the provided URI. + */ + private String uriToURL(String uri) + { + String url = null; + + if (uri.indexOf("://") > 0) + { + if (uri.startsWith("http")) + { + url = uri; + } + } + else if (uri.startsWith("/")) + { + url = protocolHostPort + uri; + } + else + { + url = baseURL + uri; + } + + return url; + } + + + + /** + * Retrieves the URL of this HTML document. + * + * @return The URL of this HTML document. + */ + public String getDocumentURL() + { + return documentURL; + } + + + + /** + * Retrieves the original HTML data used to create this document. + * + * @return The orginal HTML data used to create this document. + */ + public String getHTMLData() + { + return htmlData; + } + + + + /** + * Retrieves the contents of the HTML document with all tags removed. + * + * @return The contents of the HTML document with all tags removed, or + * <CODE>null</CODE> if a problem occurs while trying to parse the + * HTML. + */ + public String getTextData() + { + if (! parsed) + { + if (! parse()) + { + return null; + } + } + + return textData.toString(); + } + + + + /** + * Retrieves an array containing a set of URLs parsed from the HTML document + * that reference files that would normally be downloaded as part of + * retrieving a page in a browser. This includes images and external style + * sheets. + * + * @return An array containing a set of URLs to files associated with the + * HTML document, or <CODE>null</CODE> if a problem occurs while + * trying to parse the HTML. + */ + public String[] getAssociatedFiles() + { + if (! parsed) + { + if (! parse()) + { + return null; + } + } + + String[] urlArray = new String[associatedFiles.size()]; + associatedFiles.toArray(urlArray); + return urlArray; + } + + + + /** + * Retrieves an array containing a set of URLs parsed from the HTML document + * that are in the form of links to other content. + * + * @return An array containing a set of URLs parsed from the HTML document + * that are in the form of links to other content, or + * <CODE>null</CODE> if a problem occurs while trying to parse the + * HTML. + */ + public String[] getDocumentLinks() + { + if (! parsed) + { + if (! parse()) + { + return null; + } + } + + String[] urlArray = new String[documentLinks.size()]; + documentLinks.toArray(urlArray); + return urlArray; + } + + + + /** + * Retrieves an array containing a set of URLs parsed from the HTML document + * that reference images used in the document. + * + * @return An array containing a set of URLs parsed from the HTML document + * that reference images used in the document. + */ + public String[] getDocumentImages() + { + if (! parsed) + { + if (! parse()) + { + return null; + } + } + + String[] urlArray = new String[documentImages.size()]; + documentImages.toArray(urlArray); + return urlArray; + } + + + + /** + * Retrieves an array containing a set of URLs parsed from the HTML document + * that reference frames used in the document. + * + * @return An array containing a set of URLs parsed from the HTML document + * that reference frames used in the document. + */ + public String[] getDocumentFrames() + { + if (! parsed) + { + if (! parse()) + { + return null; + } + } + + String[] urlArray = new String[documentFrames.size()]; + documentFrames.toArray(urlArray); + return urlArray; + } +} + |