diff options
Diffstat (limited to 'pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java')
-rw-r--r-- | pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java | 1182 |
1 files changed, 512 insertions, 670 deletions
diff --git a/pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java b/pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java index df95f8611..67e95cf5d 100644 --- a/pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java +++ b/pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java @@ -1,4 +1,5 @@ package com.netscape.pkisilent.http; + // --- BEGIN COPYRIGHT BLOCK --- // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -25,727 +26,568 @@ import java.util.StringTokenizer; import java.util.regex.Matcher; import java.util.regex.Pattern; - - -public class HTMLDocument -{ - // Indicates whether this HTML document has been parsed. - boolean parsed; - - // A list of URLs of files that should be retrieved along with the main - // contents of the document. This may include any images contained in the - // document, and possibly any external stylesheets. - LinkedHashSet<String> associatedFiles; - - // A list of URLs of frames that are contained in the document. - LinkedHashSet<String> documentFrames; - - // A list of URLs of links that are contained in the document. - LinkedHashSet<String> documentLinks; - - // A list of URLs of images that are contained in the document. - LinkedHashSet<String> documentImages; - - // A regular expression pattern that can be used to extract a URI from an HREF - // tag. - Pattern hrefPattern; - - // A regular expression pattern that can be used to extract a URI from a SRC - // tag. - Pattern srcPattern; - - // The base URL for relative links in this document. - String baseURL; - - // The URL that may be used to access this document. - String documentURL; - - // The actual contents of the page. - String htmlData; - - // The contents of the page converted to lowercase for easier matching. - String lowerData; - - // The URL for this document with only protocol, host, and port (i.e., no - // file). - String protocolHostPort; - - // A string buffer containing the contents of the page with tags removed. - StringBuffer textData; - - - // A set of private variables used for internal processing. - private boolean lastElementIsAssociatedFile; - private boolean lastElementIsChunk; - private boolean lastElementIsComment; - private boolean lastElementIsFrame; - private boolean lastElementIsImage; - private boolean lastElementIsLink; - private boolean lastElementIsText; - private int lastElementEndPos; - private int lastElementStartPos; - private String lastURL; - - // constructor that helps to parse without url stuff - public HTMLDocument(String htmlData) - { - this.documentURL = null; - this.htmlData = htmlData; - lowerData = htmlData.toLowerCase(); - associatedFiles = null; - documentLinks = null; - documentImages = null; - textData = null; - parsed = false; - - - // Create the regex patterns that we will use for extracting URIs from tags. - hrefPattern = Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+" + +public class HTMLDocument { + // Indicates whether this HTML document has been parsed. + boolean parsed; + + // A list of URLs of files that should be retrieved along with the main + // contents of the document. This may include any images contained in the + // document, and possibly any external stylesheets. + LinkedHashSet<String> associatedFiles; + + // A list of URLs of frames that are contained in the document. + LinkedHashSet<String> documentFrames; + + // A list of URLs of links that are contained in the document. + LinkedHashSet<String> documentLinks; + + // A list of URLs of images that are contained in the document. + LinkedHashSet<String> documentImages; + + // A regular expression pattern that can be used to extract a URI from an HREF + // tag. + Pattern hrefPattern; + + // A regular expression pattern that can be used to extract a URI from a SRC + // tag. + Pattern srcPattern; + + // The base URL for relative links in this document. + String baseURL; + + // The URL that may be used to access this document. + String documentURL; + + // The actual contents of the page. + String htmlData; + + // The contents of the page converted to lowercase for easier matching. + String lowerData; + + // The URL for this document with only protocol, host, and port (i.e., no + // file). + String protocolHostPort; + + // A string buffer containing the contents of the page with tags removed. + StringBuffer textData; + + // A set of private variables used for internal processing. + private boolean lastElementIsAssociatedFile; + private boolean lastElementIsChunk; + private boolean lastElementIsComment; + private boolean lastElementIsFrame; + private boolean lastElementIsImage; + private boolean lastElementIsLink; + private boolean lastElementIsText; + private int lastElementEndPos; + private int lastElementStartPos; + private String lastURL; + + // constructor that helps to parse without url stuff + public HTMLDocument(String htmlData) { + this.documentURL = null; + this.htmlData = htmlData; + lowerData = htmlData.toLowerCase(); + associatedFiles = null; + documentLinks = null; + documentImages = null; + textData = null; + parsed = false; + + // Create the regex patterns that we will use for extracting URIs from tags. + hrefPattern = Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+" + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL); - srcPattern = Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+" + + srcPattern = Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+" + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL); - } - - - /** - * Creates a new HTML document using the provided data. - * - * @param documentURL The URL for this document. - * @param htmlData The actual data contained in the HTML document. - */ - public HTMLDocument(String documentURL, String htmlData) - throws MalformedURLException - { - this.documentURL = documentURL; - this.htmlData = htmlData; - lowerData = htmlData.toLowerCase(); - associatedFiles = null; - documentLinks = null; - documentImages = null; - textData = null; - parsed = false; - - - // Create the regex patterns that we will use for extracting URIs from tags. - hrefPattern = Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+" + + } + + /** + * Creates a new HTML document using the provided data. + * + * @param documentURL The URL for this document. + * @param htmlData The actual data contained in the HTML document. + */ + public HTMLDocument(String documentURL, String htmlData) + throws MalformedURLException { + this.documentURL = documentURL; + this.htmlData = htmlData; + lowerData = htmlData.toLowerCase(); + associatedFiles = null; + documentLinks = null; + documentImages = null; + textData = null; + parsed = false; + + // Create the regex patterns that we will use for extracting URIs from tags. + hrefPattern = Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+" + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL); - srcPattern = Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+" + + srcPattern = Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+" + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL); - URL url = new URL(documentURL); - String urlPath = url.getPath(); - if ((urlPath == null) || (urlPath.length() == 0)) - { - baseURL = documentURL; - protocolHostPort = documentURL; - } - else if (urlPath.equals("/")) - { - baseURL = documentURL; - protocolHostPort = documentURL.substring(0, documentURL.length()-1); - } - else if (urlPath.endsWith("/")) - { - baseURL = documentURL; - - int port = url.getPort(); - if (port > 0) - { - protocolHostPort = url.getProtocol() + "://" + url.getHost() + ":" + + URL url = new URL(documentURL); + String urlPath = url.getPath(); + if ((urlPath == null) || (urlPath.length() == 0)) { + baseURL = documentURL; + protocolHostPort = documentURL; + } else if (urlPath.equals("/")) { + baseURL = documentURL; + protocolHostPort = documentURL.substring(0, documentURL.length() - 1); + } else if (urlPath.endsWith("/")) { + baseURL = documentURL; + + int port = url.getPort(); + if (port > 0) { + protocolHostPort = url.getProtocol() + "://" + url.getHost() + ":" + port; - } - else - { - protocolHostPort = url.getProtocol() + "://" + url.getHost(); - } - } - else - { - int port = url.getPort(); - if (port > 0) - { - protocolHostPort = url.getProtocol() + "://" + url.getHost() + ":" + + } else { + protocolHostPort = url.getProtocol() + "://" + url.getHost(); + } + } else { + int port = url.getPort(); + if (port > 0) { + protocolHostPort = url.getProtocol() + "://" + url.getHost() + ":" + port; - } - else - { - protocolHostPort = url.getProtocol() + "://" + url.getHost(); - } - - File urlFile = new File(urlPath); - String parentDirectory = urlFile.getParent(); - if ((parentDirectory == null) || (parentDirectory.length() == 0)) - { - parentDirectory = "/"; - } - else if (! parentDirectory.startsWith("/")) - { - parentDirectory = "/" + parentDirectory; - } - - baseURL = protocolHostPort + parentDirectory; - } - - if (! baseURL.endsWith("/")) - { - baseURL = baseURL + "/"; - } - } - - - - /** - * Actually parses the HTML document and extracts useful elements from it. - * - * @return <CODE>true</CODE> if the page could be parsed successfully, or - * <CODE>false</CODE> if not. - */ - public boolean parse() - { - if (parsed) - { - return true; - } + } else { + protocolHostPort = url.getProtocol() + "://" + url.getHost(); + } + File urlFile = new File(urlPath); + String parentDirectory = urlFile.getParent(); + if ((parentDirectory == null) || (parentDirectory.length() == 0)) { + parentDirectory = "/"; + } else if (!parentDirectory.startsWith("/")) { + parentDirectory = "/" + parentDirectory; + } - try - { - associatedFiles = new LinkedHashSet<String>(); - documentFrames = new LinkedHashSet<String>(); - documentLinks = new LinkedHashSet<String>(); - documentImages = new LinkedHashSet<String>(); - textData = new StringBuffer(); - - lastElementStartPos = 0; - lastElementEndPos = -1; - String element; - while ((element = nextDocumentElement()) != null) - { - if (element.length() == 0) - { - continue; + baseURL = protocolHostPort + parentDirectory; } - if (lastElementIsText) - { - char lastChar; - if (textData.length() == 0) - { - lastChar = ' '; - } - else - { - lastChar = textData.charAt(textData.length()-1); - } - char firstChar = element.charAt(0); - if (! ((lastChar == ' ') || (lastChar == '\t') || - (lastChar == '\r') || (lastChar == '\n')) || - (firstChar == ' ') || (firstChar == '\t') || - (firstChar == '\r') || (firstChar == '\n')) - { - textData.append(" "); - } - - textData.append(element); - } - else if (lastElementIsImage) - { - if (lastURL != null) - { - documentImages.add(lastURL); - associatedFiles.add(lastURL); - } + if (!baseURL.endsWith("/")) { + baseURL = baseURL + "/"; } - else if (lastElementIsFrame) - { - if (lastURL != null) - { - documentFrames.add(lastURL); - associatedFiles.add(lastURL); - } - } - else if (lastElementIsLink) - { - if (lastURL != null) - { - documentLinks.add(lastURL); - } - } - else if (lastElementIsAssociatedFile) - { - if (lastURL != null) - { - associatedFiles.add(lastURL); - } - } - else if (lastElementIsChunk || lastElementIsComment) - { - // Don't need to do anything with this. - } - else - { - // Also don't need anything here. - } - } - - parsed = true; - } - catch (Exception e) - { - associatedFiles = null; - documentLinks = null; - documentImages = null; - textData = null; - parsed = false; } - return parsed; - } + /** + * Actually parses the HTML document and extracts useful elements from it. + * + * @return <CODE>true</CODE> if the page could be parsed successfully, or <CODE>false</CODE> if not. + */ + public boolean parse() { + if (parsed) { + return true; + } + try { + associatedFiles = new LinkedHashSet<String>(); + documentFrames = new LinkedHashSet<String>(); + documentLinks = new LinkedHashSet<String>(); + documentImages = new LinkedHashSet<String>(); + textData = new StringBuffer(); + + lastElementStartPos = 0; + lastElementEndPos = -1; + String element; + while ((element = nextDocumentElement()) != null) { + if (element.length() == 0) { + continue; + } + + if (lastElementIsText) { + char lastChar; + if (textData.length() == 0) { + lastChar = ' '; + } else { + lastChar = textData.charAt(textData.length() - 1); + } + char firstChar = element.charAt(0); + if (!((lastChar == ' ') || (lastChar == '\t') || + (lastChar == '\r') || (lastChar == '\n')) || + (firstChar == ' ') || (firstChar == '\t') || + (firstChar == '\r') || (firstChar == '\n')) { + textData.append(" "); + } + + textData.append(element); + } else if (lastElementIsImage) { + if (lastURL != null) { + documentImages.add(lastURL); + associatedFiles.add(lastURL); + } + } else if (lastElementIsFrame) { + if (lastURL != null) { + documentFrames.add(lastURL); + associatedFiles.add(lastURL); + } + } else if (lastElementIsLink) { + if (lastURL != null) { + documentLinks.add(lastURL); + } + } else if (lastElementIsAssociatedFile) { + if (lastURL != null) { + associatedFiles.add(lastURL); + } + } else if (lastElementIsChunk || lastElementIsComment) { + // Don't need to do anything with this. + } else { + // Also don't need anything here. + } + } + parsed = true; + } catch (Exception e) { + associatedFiles = null; + documentLinks = null; + documentImages = null; + textData = null; + parsed = false; + } - /** - * Retrieves the next element from the HTML document. An HTML element can - * include a string of plain text, a single HTML tag, or a larger chunk of - * HTML including a start and end tag, all of which should be considered a - * single element. - */ - private String nextDocumentElement() - { - // If we're at the end of the HTML, then return null. - if (lastElementEndPos >= htmlData.length()) - { - return null; + return parsed; } + /** + * Retrieves the next element from the HTML document. An HTML element can + * include a string of plain text, a single HTML tag, or a larger chunk of + * HTML including a start and end tag, all of which should be considered a + * single element. + */ + private String nextDocumentElement() { + // If we're at the end of the HTML, then return null. + if (lastElementEndPos >= htmlData.length()) { + return null; + } - // Initialize the variables we will use for the search. - lastElementStartPos = lastElementEndPos+1; - lastElementIsAssociatedFile = false; - lastElementIsChunk = false; - lastElementIsComment = false; - lastElementIsFrame = false; - lastElementIsImage = false; - lastElementIsLink = false; - lastElementIsText = false; - lastURL = null; - - - // Find the location of the next open angle bracket. If there is none, then - // the rest of the document must be plain text. - int openPos = lowerData.indexOf('<', lastElementStartPos); - if (openPos < 0) - { - lastElementEndPos = htmlData.length(); - lastElementIsText = true; - return htmlData.substring(lastElementStartPos); - } + // Initialize the variables we will use for the search. + lastElementStartPos = lastElementEndPos + 1; + lastElementIsAssociatedFile = false; + lastElementIsChunk = false; + lastElementIsComment = false; + lastElementIsFrame = false; + lastElementIsImage = false; + lastElementIsLink = false; + lastElementIsText = false; + lastURL = null; + + // Find the location of the next open angle bracket. If there is none, then + // the rest of the document must be plain text. + int openPos = lowerData.indexOf('<', lastElementStartPos); + if (openPos < 0) { + lastElementEndPos = htmlData.length(); + lastElementIsText = true; + return htmlData.substring(lastElementStartPos); + } + // If the location of the next open tag is not we started looking, then read + // everything up to that tag as text. + if (openPos > lastElementStartPos) { + lastElementEndPos = openPos - 1; + lastElementIsText = true; + return htmlData.substring(lastElementStartPos, openPos); + } - // If the location of the next open tag is not we started looking, then read - // everything up to that tag as text. - if (openPos > lastElementStartPos) - { - lastElementEndPos = openPos-1; - lastElementIsText = true; - return htmlData.substring(lastElementStartPos, openPos); - } + // The start position is an open tag. See if the tag is actually "<!--", + // which indicates an HTML comment. If that's the case, then find the + // closing "-->". + if (openPos == lowerData.indexOf("<!--", lastElementStartPos)) { + int closePos = lowerData.indexOf("-->", openPos + 1); + if (closePos < 0) { + // This looks like an unterminated comment. We can't do much else + // here, so just stop parsing. + return null; + } else { + lastElementEndPos = closePos + 2; + lastElementIsComment = true; + return htmlData.substring(lastElementStartPos, lastElementEndPos + 1); + } + } + // Find the location of the next close angle bracket. If there is none, + // then we have an unmatched open tag. What to do here? I guess just treat + // the rest of the document as text. + int closePos = lowerData.indexOf('>', openPos + 1); + if (closePos < 0) { + lastElementEndPos = htmlData.length(); + lastElementIsText = true; + return htmlData.substring(lastElementStartPos); + } - // The start position is an open tag. See if the tag is actually "<!--", - // which indicates an HTML comment. If that's the case, then find the - // closing "-->". - if (openPos == lowerData.indexOf("<!--", lastElementStartPos)) - { - int closePos = lowerData.indexOf("-->", openPos+1); - if (closePos < 0) - { - // This looks like an unterminated comment. We can't do much else - // here, so just stop parsing. - return null; - } - else - { - lastElementEndPos = closePos + 2; - lastElementIsComment = true; - return htmlData.substring(lastElementStartPos, lastElementEndPos+1); - } - } + // Grab the contents of the tag in both normal and lowercase. + String tag = htmlData.substring(openPos, closePos + 1); + String strippedTag = htmlData.substring(openPos + 1, closePos).trim(); + StringTokenizer tokenizer = new StringTokenizer(strippedTag, " \t\r\n=\"'"); + lastElementEndPos = closePos; + if (!tokenizer.hasMoreTokens()) { + return tag; + } - // Find the location of the next close angle bracket. If there is none, - // then we have an unmatched open tag. What to do here? I guess just treat - // the rest of the document as text. - int closePos = lowerData.indexOf('>', openPos+1); - if (closePos < 0) - { - lastElementEndPos = htmlData.length(); - lastElementIsText = true; - return htmlData.substring(lastElementStartPos); - } + String token = tokenizer.nextToken(); + String lowerToken = token.toLowerCase(); + + if (lowerToken.equals("a") || lowerToken.equals("area")) { + while (tokenizer.hasMoreTokens()) { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("href")) { + try { + Matcher matcher = hrefPattern.matcher(tag); + lastURL = uriToURL(matcher.replaceAll("$1")); + if (lastURL != null) { + lastElementIsLink = true; + } + } catch (Exception e) { + } + break; + } + } + } else if (lowerToken.equals("base")) { + while (tokenizer.hasMoreTokens()) { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("href")) { + try { + Matcher matcher = hrefPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + if (!uri.endsWith("/")) { + uri = uri + "/"; + } + + baseURL = uri; + } catch (Exception e) { + } + break; + } + } + } else if (lowerToken.equals("frame") || lowerToken.equals("iframe") || + lowerToken.equals("input")) { + while (tokenizer.hasMoreTokens()) { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("src")) { + try { + Matcher matcher = srcPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + lastURL = uriToURL(uri); + if (lastURL != null) { + lastElementIsFrame = true; + lastElementIsAssociatedFile = true; + } + } catch (Exception e) { + } + break; + } + } + } else if (lowerToken.equals("img")) { + while (tokenizer.hasMoreTokens()) { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("src")) { + try { + Matcher matcher = srcPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + lastURL = uriToURL(uri); + if (lastURL != null) { + lastElementIsImage = true; + } + } catch (Exception e) { + } + break; + } + } + } else if (lowerToken.equals("link")) { + boolean isStyleSheet = false; + + while (tokenizer.hasMoreTokens()) { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("href")) { + try { + Matcher matcher = hrefPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + lastURL = uriToURL(uri); + if (lastURL != null) { + lastElementIsLink = true; + } + } catch (Exception e) { + } + break; + } else if (token.equalsIgnoreCase("rel")) { + if (tokenizer.hasMoreTokens()) { + String relType = tokenizer.nextToken(); + if (relType.equalsIgnoreCase("stylesheet")) { + isStyleSheet = true; + } + } + } + } + if (lastURL != null) { + if (isStyleSheet) { + lastElementIsAssociatedFile = true; + } else { + lastElementIsLink = true; + } + } + } else if (lowerToken.equals("script")) { + while (tokenizer.hasMoreTokens()) { + token = tokenizer.nextToken(); + if (token.equalsIgnoreCase("src")) { + try { + Matcher matcher = srcPattern.matcher(tag); + String uri = matcher.replaceAll("$1"); + lastURL = uriToURL(uri); + } catch (Exception e) { + } + break; + } + } - // Grab the contents of the tag in both normal and lowercase. - String tag = htmlData.substring(openPos, closePos+1); - String strippedTag = htmlData.substring(openPos+1, closePos).trim(); - StringTokenizer tokenizer = new StringTokenizer(strippedTag, " \t\r\n=\"'"); - lastElementEndPos = closePos; + if (lastURL == null) { + int endScriptPos = lowerData.indexOf("</script>", lastElementEndPos + 1); + if (endScriptPos > 0) { + lastElementEndPos = endScriptPos + 8; + tag = htmlData.substring(lastElementStartPos, lastElementEndPos + 1); + lastElementIsChunk = true; + } + } else { + lastElementIsAssociatedFile = true; + } + } - if (! tokenizer.hasMoreTokens()) - { - return tag; + return tag; } - String token = tokenizer.nextToken(); - String lowerToken = token.toLowerCase(); - - if (lowerToken.equals("a") || lowerToken.equals("area")) - { - while (tokenizer.hasMoreTokens()) - { - token = tokenizer.nextToken(); - if (token.equalsIgnoreCase("href")) - { - try - { - Matcher matcher = hrefPattern.matcher(tag); - lastURL = uriToURL(matcher.replaceAll("$1")); - if (lastURL != null) - { - lastElementIsLink = true; + /** + * Converts the provided URI to a URL. The provided URI may be a URL already, + * or it may also be an absolute path on the server or a path relative to the + * base URL. + * + * @param uri The URI to convert to a URL. + * + * @return The URL based on the provided URI. + */ + private String uriToURL(String uri) { + String url = null; + + if (uri.indexOf("://") > 0) { + if (uri.startsWith("http")) { + url = uri; } - } catch (Exception e) {} - break; + } else if (uri.startsWith("/")) { + url = protocolHostPort + uri; + } else { + url = baseURL + uri; } - } + + return url; } - else if (lowerToken.equals("base")) - { - while (tokenizer.hasMoreTokens()) - { - token = tokenizer.nextToken(); - if (token.equalsIgnoreCase("href")) - { - try - { - Matcher matcher = hrefPattern.matcher(tag); - String uri = matcher.replaceAll("$1"); - if (! uri.endsWith("/")) - { - uri = uri + "/"; - } - baseURL = uri; - } catch (Exception e) {} - break; - } - } + /** + * Retrieves the URL of this HTML document. + * + * @return The URL of this HTML document. + */ + public String getDocumentURL() { + return documentURL; } - else if (lowerToken.equals("frame") || lowerToken.equals("iframe") || - lowerToken.equals("input")) - { - while (tokenizer.hasMoreTokens()) - { - token = tokenizer.nextToken(); - if (token.equalsIgnoreCase("src")) - { - try - { - Matcher matcher = srcPattern.matcher(tag); - String uri = matcher.replaceAll("$1"); - lastURL = uriToURL(uri); - if (lastURL != null) - { - lastElementIsFrame = true; - lastElementIsAssociatedFile = true; - } - } catch (Exception e) {} - break; - } - } + + /** + * Retrieves the original HTML data used to create this document. + * + * @return The orginal HTML data used to create this document. + */ + public String getHTMLData() { + return htmlData; } - else if (lowerToken.equals("img")) - { - while (tokenizer.hasMoreTokens()) - { - token = tokenizer.nextToken(); - if (token.equalsIgnoreCase("src")) - { - try - { - Matcher matcher = srcPattern.matcher(tag); - String uri = matcher.replaceAll("$1"); - lastURL = uriToURL(uri); - if (lastURL != null) - { - lastElementIsImage = true; + + /** + * Retrieves the contents of the HTML document with all tags removed. + * + * @return The contents of the HTML document with all tags removed, or <CODE>null</CODE> if a problem occurs while trying to parse the + * HTML. + */ + public String getTextData() { + if (!parsed) { + if (!parse()) { + return null; } - } catch (Exception e) {} - break; } - } + + return textData.toString(); } - else if (lowerToken.equals("link")) - { - boolean isStyleSheet = false; - - while (tokenizer.hasMoreTokens()) - { - token = tokenizer.nextToken(); - if (token.equalsIgnoreCase("href")) - { - try - { - Matcher matcher = hrefPattern.matcher(tag); - String uri = matcher.replaceAll("$1"); - lastURL = uriToURL(uri); - if (lastURL != null) - { - lastElementIsLink = true; - } - } catch (Exception e) {} - break; - } - else if (token.equalsIgnoreCase("rel")) - { - if (tokenizer.hasMoreTokens()) - { - String relType = tokenizer.nextToken(); - if (relType.equalsIgnoreCase("stylesheet")) - { - isStyleSheet = true; + + /** + * Retrieves an array containing a set of URLs parsed from the HTML document + * that reference files that would normally be downloaded as part of + * retrieving a page in a browser. This includes images and external style + * sheets. + * + * @return An array containing a set of URLs to files associated with the + * HTML document, or <CODE>null</CODE> if a problem occurs while + * trying to parse the HTML. + */ + public String[] getAssociatedFiles() { + if (!parsed) { + if (!parse()) { + return null; } - } } - } - if (lastURL != null) - { - if (isStyleSheet) - { - lastElementIsAssociatedFile = true; - } - else - { - lastElementIsLink = true; - } - } - } - else if (lowerToken.equals("script")) - { - while (tokenizer.hasMoreTokens()) - { - token = tokenizer.nextToken(); - if (token.equalsIgnoreCase("src")) - { - try - { - Matcher matcher = srcPattern.matcher(tag); - String uri = matcher.replaceAll("$1"); - lastURL = uriToURL(uri); - } catch (Exception e) {} - break; - } - } - - if (lastURL == null) - { - int endScriptPos = lowerData.indexOf("</script>", lastElementEndPos+1); - if (endScriptPos > 0) - { - lastElementEndPos = endScriptPos + 8; - tag = htmlData.substring(lastElementStartPos, lastElementEndPos+1); - lastElementIsChunk = true; - } - } - else - { - lastElementIsAssociatedFile = true; - } + String[] urlArray = new String[associatedFiles.size()]; + associatedFiles.toArray(urlArray); + return urlArray; } - return tag; - } - - - - /** - * Converts the provided URI to a URL. The provided URI may be a URL already, - * or it may also be an absolute path on the server or a path relative to the - * base URL. - * - * @param uri The URI to convert to a URL. - * - * @return The URL based on the provided URI. - */ - private String uriToURL(String uri) - { - String url = null; - - if (uri.indexOf("://") > 0) - { - if (uri.startsWith("http")) - { - url = uri; - } - } - else if (uri.startsWith("/")) - { - url = protocolHostPort + uri; - } - else - { - url = baseURL + uri; - } + /** + * Retrieves an array containing a set of URLs parsed from the HTML document + * that are in the form of links to other content. + * + * @return An array containing a set of URLs parsed from the HTML document + * that are in the form of links to other content, or <CODE>null</CODE> if a problem occurs while trying to parse the + * HTML. + */ + public String[] getDocumentLinks() { + if (!parsed) { + if (!parse()) { + return null; + } + } - return url; - } - - - - /** - * Retrieves the URL of this HTML document. - * - * @return The URL of this HTML document. - */ - public String getDocumentURL() - { - return documentURL; - } - - - - /** - * Retrieves the original HTML data used to create this document. - * - * @return The orginal HTML data used to create this document. - */ - public String getHTMLData() - { - return htmlData; - } - - - - /** - * Retrieves the contents of the HTML document with all tags removed. - * - * @return The contents of the HTML document with all tags removed, or - * <CODE>null</CODE> if a problem occurs while trying to parse the - * HTML. - */ - public String getTextData() - { - if (! parsed) - { - if (! parse()) - { - return null; - } + String[] urlArray = new String[documentLinks.size()]; + documentLinks.toArray(urlArray); + return urlArray; } - return textData.toString(); - } - - - - /** - * Retrieves an array containing a set of URLs parsed from the HTML document - * that reference files that would normally be downloaded as part of - * retrieving a page in a browser. This includes images and external style - * sheets. - * - * @return An array containing a set of URLs to files associated with the - * HTML document, or <CODE>null</CODE> if a problem occurs while - * trying to parse the HTML. - */ - public String[] getAssociatedFiles() - { - if (! parsed) - { - if (! parse()) - { - return null; - } - } + /** + * Retrieves an array containing a set of URLs parsed from the HTML document + * that reference images used in the document. + * + * @return An array containing a set of URLs parsed from the HTML document + * that reference images used in the document. + */ + public String[] getDocumentImages() { + if (!parsed) { + if (!parse()) { + return null; + } + } - String[] urlArray = new String[associatedFiles.size()]; - associatedFiles.toArray(urlArray); - return urlArray; - } - - - - /** - * Retrieves an array containing a set of URLs parsed from the HTML document - * that are in the form of links to other content. - * - * @return An array containing a set of URLs parsed from the HTML document - * that are in the form of links to other content, or - * <CODE>null</CODE> if a problem occurs while trying to parse the - * HTML. - */ - public String[] getDocumentLinks() - { - if (! parsed) - { - if (! parse()) - { - return null; - } + String[] urlArray = new String[documentImages.size()]; + documentImages.toArray(urlArray); + return urlArray; } - String[] urlArray = new String[documentLinks.size()]; - documentLinks.toArray(urlArray); - return urlArray; - } - - - - /** - * Retrieves an array containing a set of URLs parsed from the HTML document - * that reference images used in the document. - * - * @return An array containing a set of URLs parsed from the HTML document - * that reference images used in the document. - */ - public String[] getDocumentImages() - { - if (! parsed) - { - if (! parse()) - { - return null; - } - } + /** + * Retrieves an array containing a set of URLs parsed from the HTML document + * that reference frames used in the document. + * + * @return An array containing a set of URLs parsed from the HTML document + * that reference frames used in the document. + */ + public String[] getDocumentFrames() { + if (!parsed) { + if (!parse()) { + return null; + } + } - String[] urlArray = new String[documentImages.size()]; - documentImages.toArray(urlArray); - return urlArray; - } - - - - /** - * Retrieves an array containing a set of URLs parsed from the HTML document - * that reference frames used in the document. - * - * @return An array containing a set of URLs parsed from the HTML document - * that reference frames used in the document. - */ - public String[] getDocumentFrames() - { - if (! parsed) - { - if (! parse()) - { - return null; - } + String[] urlArray = new String[documentFrames.size()]; + documentFrames.toArray(urlArray); + return urlArray; } - - String[] urlArray = new String[documentFrames.size()]; - documentFrames.toArray(urlArray); - return urlArray; - } } - |