From 621d9e5c413e561293d7484b93882d985b3fe15f Mon Sep 17 00:00:00 2001 From: Endi Sukma Dewata Date: Sat, 24 Mar 2012 02:27:47 -0500 Subject: Removed unnecessary pki folder. Previously the source code was located inside a pki folder. This folder was created during svn migration and is no longer needed. This folder has now been removed and the contents have been moved up one level. Ticket #131 --- .../com/netscape/pkisilent/http/HTMLDocument.java | 595 +++++++++++++++++++++ 1 file changed, 595 insertions(+) create mode 100644 base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java (limited to 'base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java') diff --git a/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java b/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java new file mode 100644 index 000000000..e8de29081 --- /dev/null +++ b/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java @@ -0,0 +1,595 @@ +package com.netscape.pkisilent.http; + +// --- BEGIN COPYRIGHT BLOCK --- +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; version 2 of the License. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +// +// (C) 2007 Red Hat, Inc. +// All rights reserved. +// --- END COPYRIGHT BLOCK --- + +import java.io.File; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.LinkedHashSet; +import java.util.StringTokenizer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class HTMLDocument { + // Indicates whether this HTML document has been parsed. + boolean parsed; + + // A list of URLs of files that should be retrieved along with the main + // contents of the document. This may include any images contained in the + // document, and possibly any external stylesheets. + LinkedHashSet associatedFiles; + + // A list of URLs of frames that are contained in the document. + LinkedHashSet documentFrames; + + // A list of URLs of links that are contained in the document. + LinkedHashSet documentLinks; + + // A list of URLs of images that are contained in the document. + LinkedHashSet documentImages; + + // A regular expression pattern that can be used to extract a URI from an HREF + // tag. + Pattern hrefPattern; + + // A regular expression pattern that can be used to extract a URI from a SRC + // tag. + Pattern srcPattern; + + // The base URL for relative links in this document. + String baseURL; + + // The URL that may be used to access this document. + String documentURL; + + // The actual contents of the page. + String htmlData; + + // The contents of the page converted to lowercase for easier matching. + String lowerData; + + // The URL for this document with only protocol, host, and port (i.e., no + // file). + String protocolHostPort; + + // A string buffer containing the contents of the page with tags removed. + StringBuffer textData; + + // A set of private variables used for internal processing. + private boolean lastElementIsAssociatedFile; + private boolean lastElementIsChunk; + private boolean lastElementIsComment; + private boolean lastElementIsFrame; + private boolean lastElementIsImage; + private boolean lastElementIsLink; + private boolean lastElementIsText; + private int lastElementEndPos; + private int lastElementStartPos; + private String lastURL; + + // constructor that helps to parse without url stuff + public HTMLDocument(String htmlData) { + this.documentURL = null; + this.htmlData = htmlData; + lowerData = htmlData.toLowerCase(); + associatedFiles = null; + documentLinks = null; + documentImages = null; + textData = null; + parsed = false; + + // Create the regex patterns that we will use for extracting URIs from tags. + hrefPattern = Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+" + + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL); + srcPattern = Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+" + + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL); + } + + /** + * Creates a new HTML document using the provided data. + * + * @param documentURL The URL for this document. + * @param htmlData The actual data contained in the HTML document. + */ + public HTMLDocument(String documentURL, String htmlData) + throws MalformedURLException { + this.documentURL = documentURL; + this.htmlData = htmlData; + lowerData = htmlData.toLowerCase(); + associatedFiles = null; + documentLinks = null; + documentImages = null; + textData = null; + parsed = false; + + // Create the regex patterns that we will use for extracting URIs from tags. + hrefPattern = Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+" + + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL); + srcPattern = Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+" + + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL); + + URL url = new URL(documentURL); + String urlPath = url.getPath(); + if ((urlPath == null) || (urlPath.length() == 0)) { + baseURL = documentURL; + protocolHostPort = documentURL; + } else if (urlPath.equals("/")) { + baseURL = documentURL; + protocolHostPort = documentURL.substring(0, documentURL.length() - 1); + } else if (urlPath.endsWith("/")) { + baseURL = documentURL; + + int port = url.getPort(); + if (port > 0) { + protocolHostPort = url.getProtocol() + "://" + url.getHost() + ":" + + port; + } else { + protocolHostPort = url.getProtocol() + "://" + url.getHost(); + } + } else { + int port = url.getPort(); + if (port > 0) { + protocolHostPort = url.getProtocol() + "://" + url.getHost() + ":" + + port; + } else { + protocolHostPort = url.getProtocol() + "://" + url.getHost(); + } + + File urlFile = new File(urlPath); + String parentDirectory = urlFile.getParent(); + if ((parentDirectory == null) || (parentDirectory.length() == 0)) { + parentDirectory = "/"; + } else if (!parentDirectory.startsWith("/")) { + parentDirectory = "/" + parentDirectory; + } + + baseURL = protocolHostPort + parentDirectory; + } + + if (!baseURL.endsWith("/")) { + baseURL = baseURL + "/"; + } + } + + /** + * Actually parses the HTML document and extracts useful elements from it. + * + * @return true if the page could be parsed successfully, or false if not. + */ + public boolean parse() { + if (parsed) { + return true; + } + + try { + associatedFiles = new LinkedHashSet(); + documentFrames = new LinkedHashSet(); + documentLinks = new LinkedHashSet(); + documentImages = new LinkedHashSet(); + textData = new StringBuffer(); + + lastElementStartPos = 0; + lastElementEndPos = -1; + String element; + while ((element = nextDocumentElement()) != null) { + if (element.length() == 0) { + continue; + } + + if (lastElementIsText) { + char lastChar; + if (textData.length() == 0) { + lastChar = ' '; + } else { + lastChar = textData.charAt(textData.length() - 1); + } + char firstChar = element.charAt(0); + if (!((lastChar == ' ') || (lastChar == '\t') || + (lastChar == '\r') || (lastChar == '\n')) || + (firstChar == ' ') || (firstChar == '\t') || + (firstChar == '\r') || (firstChar == '\n')) { + textData.append(" "); + } + + textData.append(element); + } else if (lastElementIsImage) { + if (lastURL != null) { + documentImages.add(lastURL); + associatedFiles.add(lastURL); + } + } else if (lastElementIsFrame) { + if (lastURL != null) { + documentFrames.add(lastURL); + associatedFiles.add(lastURL); + } + } else if (lastElementIsLink) { + if (lastURL != null) { + documentLinks.add(lastURL); + } + } else if (lastElementIsAssociatedFile) { + if (lastURL != null) { + associatedFiles.add(lastURL); + } + } else if (lastElementIsChunk || lastElementIsComment) { + // Don't need to do anything with this. + } else { + // Also don't need anything here. + } + } + + parsed = true; + } catch (Exception e) { + associatedFiles = null; + documentLinks = null; + documentImages = null; + textData = null; + parsed = false; + } + + return parsed; + } + + /** + * Retrieves the next element from the HTML document. An HTML element can + * include a string of plain text, a single HTML tag, or a larger chunk of + * HTML including a start and end tag, all of which should be considered a + * single element. + */ + private String nextDocumentElement() { + // If we're at the end of the HTML, then return null. + if (lastElementEndPos >= htmlData.length()) { + return null; + } + + // Initialize the variables we will use for the search. + lastElementStartPos = lastElementEndPos + 1; + lastElementIsAssociatedFile = false; + lastElementIsChunk = false; + lastElementIsComment = false; + lastElementIsFrame = false; + lastElementIsImage = false; + lastElementIsLink = false; + lastElementIsText = false; + lastURL = null; + + // Find the location of the next open angle bracket. If there is none, then + // the rest of the document must be plain text. + int openPos = lowerData.indexOf('<', lastElementStartPos); + if (openPos < 0) { + lastElementEndPos = htmlData.length(); + lastElementIsText = true; + return htmlData.substring(lastElementStartPos); + } + + // If the location of the next open tag is not we started looking, then read + // everything up to that tag as text. + if (openPos > lastElementStartPos) { + lastElementEndPos = openPos - 1; + lastElementIsText = true; + return htmlData.substring(lastElementStartPos, openPos); + } + + // The start position is an open tag. See if the tag is actually "". + if (openPos == lowerData.indexOf("", openPos + 1); + if (closePos < 0) { + // This looks like an unterminated comment. We can't do much else + // here, so just stop parsing. + return null; + } else { + lastElementEndPos = closePos + 2; + lastElementIsComment = true; + return htmlData.substring(lastElementStartPos, lastElementEndPos + 1); + } + } + + // Find the location of the next close angle bracket. If there is none, + // then we have an unmatched open tag. What to do here? I guess just treat + // the rest of the document as text. + int closePos = lowerData.indexOf('>', openPos + 1); + if (closePos < 0) { + lastElementEndPos = htmlData.length(); + lastElementIsText = true; + return htmlData.substring(lastElementStartPos); + } + + // Grab the contents of the tag in both normal and lowercase. + String tag = htmlData.substring(openPos, closePos + 1); + String strippedTag = htmlData.substring(openPos + 1, closePos).trim(); + StringTokenizer tokenizer = new StringTokenizer(strippedTag, " \t\r\n=\"'"); + lastElementEndPos = closePos; + + if (!tokenizer.hasMoreTokens()) { + return tag; + } + + String token = tokenizer.nextToken(); + String lowerToken = token.toLowerCase(); + + if (lowerToken.equals("a") || lowerToken.equals("area")) { + while (tokenizer.hasMoreTokens()) { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("href")) { + try { + Matcher matcher = hrefPattern.matcher(tag); + lastURL = uriToURL(matcher.replaceAll("$1")); + if (lastURL != null) { + lastElementIsLink = true; + } + } catch (Exception e) { + } + break; + } + } + } else if (lowerToken.equals("base")) { + while (tokenizer.hasMoreTokens()) { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("href")) { + try { + Matcher matcher = hrefPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + if (!uri.endsWith("/")) { + uri = uri + "/"; + } + + baseURL = uri; + } catch (Exception e) { + } + break; + } + } + } else if (lowerToken.equals("frame") || lowerToken.equals("iframe") || + lowerToken.equals("input")) { + while (tokenizer.hasMoreTokens()) { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("src")) { + try { + Matcher matcher = srcPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + lastURL = uriToURL(uri); + if (lastURL != null) { + lastElementIsFrame = true; + lastElementIsAssociatedFile = true; + } + } catch (Exception e) { + } + break; + } + } + } else if (lowerToken.equals("img")) { + while (tokenizer.hasMoreTokens()) { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("src")) { + try { + Matcher matcher = srcPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + lastURL = uriToURL(uri); + if (lastURL != null) { + lastElementIsImage = true; + } + } catch (Exception e) { + } + break; + } + } + } else if (lowerToken.equals("link")) { + boolean isStyleSheet = false; + + while (tokenizer.hasMoreTokens()) { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("href")) { + try { + Matcher matcher = hrefPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + lastURL = uriToURL(uri); + if (lastURL != null) { + lastElementIsLink = true; + } + } catch (Exception e) { + } + break; + } else if (token.equalsIgnoreCase("rel")) { + if (tokenizer.hasMoreTokens()) { + String relType = tokenizer.nextToken(); + if (relType.equalsIgnoreCase("stylesheet")) { + isStyleSheet = true; + } + } + } + } + + if (lastURL != null) { + if (isStyleSheet) { + lastElementIsAssociatedFile = true; + } else { + lastElementIsLink = true; + } + } + } else if (lowerToken.equals("script")) { + while (tokenizer.hasMoreTokens()) { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("src")) { + try { + Matcher matcher = srcPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + lastURL = uriToURL(uri); + } catch (Exception e) { + } + break; + } + } + + if (lastURL == null) { + int endScriptPos = lowerData.indexOf("", lastElementEndPos + 1); + if (endScriptPos > 0) { + lastElementEndPos = endScriptPos + 8; + tag = htmlData.substring(lastElementStartPos, lastElementEndPos + 1); + lastElementIsChunk = true; + } + } else { + lastElementIsAssociatedFile = true; + } + } + + return tag; + } + + /** + * Converts the provided URI to a URL. The provided URI may be a URL already, + * or it may also be an absolute path on the server or a path relative to the + * base URL. + * + * @param uri The URI to convert to a URL. + * + * @return The URL based on the provided URI. + */ + private String uriToURL(String uri) { + String url = null; + + if (uri.indexOf("://") > 0) { + if (uri.startsWith("http")) { + url = uri; + } + } else if (uri.startsWith("/")) { + url = protocolHostPort + uri; + } else { + url = baseURL + uri; + } + + return url; + } + + /** + * Retrieves the URL of this HTML document. + * + * @return The URL of this HTML document. + */ + public String getDocumentURL() { + return documentURL; + } + + /** + * Retrieves the original HTML data used to create this document. + * + * @return The orginal HTML data used to create this document. + */ + public String getHTMLData() { + return htmlData; + } + + /** + * Retrieves the contents of the HTML document with all tags removed. + * + * @return The contents of the HTML document with all tags removed, or null if a problem occurs while + * trying to parse the + * HTML. + */ + public String getTextData() { + if (!parsed) { + if (!parse()) { + return null; + } + } + + return textData.toString(); + } + + /** + * Retrieves an array containing a set of URLs parsed from the HTML document + * that reference files that would normally be downloaded as part of + * retrieving a page in a browser. This includes images and external style + * sheets. + * + * @return An array containing a set of URLs to files associated with the + * HTML document, or null if a problem occurs while + * trying to parse the HTML. + */ + public String[] getAssociatedFiles() { + if (!parsed) { + if (!parse()) { + return null; + } + } + + String[] urlArray = new String[associatedFiles.size()]; + associatedFiles.toArray(urlArray); + return urlArray; + } + + /** + * Retrieves an array containing a set of URLs parsed from the HTML document + * that are in the form of links to other content. + * + * @return An array containing a set of URLs parsed from the HTML document + * that are in the form of links to other content, or null if a problem occurs while trying to + * parse the + * HTML. + */ + public String[] getDocumentLinks() { + if (!parsed) { + if (!parse()) { + return null; + } + } + + String[] urlArray = new String[documentLinks.size()]; + documentLinks.toArray(urlArray); + return urlArray; + } + + /** + * Retrieves an array containing a set of URLs parsed from the HTML document + * that reference images used in the document. + * + * @return An array containing a set of URLs parsed from the HTML document + * that reference images used in the document. + */ + public String[] getDocumentImages() { + if (!parsed) { + if (!parse()) { + return null; + } + } + + String[] urlArray = new String[documentImages.size()]; + documentImages.toArray(urlArray); + return urlArray; + } + + /** + * Retrieves an array containing a set of URLs parsed from the HTML document + * that reference frames used in the document. + * + * @return An array containing a set of URLs parsed from the HTML document + * that reference frames used in the document. + */ + public String[] getDocumentFrames() { + if (!parsed) { + if (!parse()) { + return null; + } + } + + String[] urlArray = new String[documentFrames.size()]; + documentFrames.toArray(urlArray); + return urlArray; + } +} -- cgit