summaryrefslogtreecommitdiffstats
path: root/pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java
diff options
context:
space:
mode:
Diffstat (limited to 'pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java')
-rw-r--r--pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java1208
1 files changed, 533 insertions, 675 deletions
diff --git a/pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java b/pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java
index df95f861..5c655649 100644
--- a/pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java
+++ b/pki/base/silent/src/com/netscape/pkisilent/http/HTMLDocument.java
@@ -1,4 +1,5 @@
package com.netscape.pkisilent.http;
+
// --- BEGIN COPYRIGHT BLOCK ---
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@@ -25,727 +26,584 @@ import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
-
-public class HTMLDocument
-{
- // Indicates whether this HTML document has been parsed.
- boolean parsed;
-
- // A list of URLs of files that should be retrieved along with the main
- // contents of the document. This may include any images contained in the
- // document, and possibly any external stylesheets.
- LinkedHashSet<String> associatedFiles;
-
- // A list of URLs of frames that are contained in the document.
- LinkedHashSet<String> documentFrames;
-
- // A list of URLs of links that are contained in the document.
- LinkedHashSet<String> documentLinks;
-
- // A list of URLs of images that are contained in the document.
- LinkedHashSet<String> documentImages;
-
- // A regular expression pattern that can be used to extract a URI from an HREF
- // tag.
- Pattern hrefPattern;
-
- // A regular expression pattern that can be used to extract a URI from a SRC
- // tag.
- Pattern srcPattern;
-
- // The base URL for relative links in this document.
- String baseURL;
-
- // The URL that may be used to access this document.
- String documentURL;
-
- // The actual contents of the page.
- String htmlData;
-
- // The contents of the page converted to lowercase for easier matching.
- String lowerData;
-
- // The URL for this document with only protocol, host, and port (i.e., no
- // file).
- String protocolHostPort;
-
- // A string buffer containing the contents of the page with tags removed.
- StringBuffer textData;
-
-
- // A set of private variables used for internal processing.
- private boolean lastElementIsAssociatedFile;
- private boolean lastElementIsChunk;
- private boolean lastElementIsComment;
- private boolean lastElementIsFrame;
- private boolean lastElementIsImage;
- private boolean lastElementIsLink;
- private boolean lastElementIsText;
- private int lastElementEndPos;
- private int lastElementStartPos;
- private String lastURL;
-
- // constructor that helps to parse without url stuff
- public HTMLDocument(String htmlData)
- {
- this.documentURL = null;
- this.htmlData = htmlData;
- lowerData = htmlData.toLowerCase();
- associatedFiles = null;
- documentLinks = null;
- documentImages = null;
- textData = null;
- parsed = false;
-
-
- // Create the regex patterns that we will use for extracting URIs from tags.
- hrefPattern = Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+" +
- "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL);
- srcPattern = Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+" +
- "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL);
- }
-
-
- /**
- * Creates a new HTML document using the provided data.
- *
- * @param documentURL The URL for this document.
- * @param htmlData The actual data contained in the HTML document.
- */
- public HTMLDocument(String documentURL, String htmlData)
- throws MalformedURLException
- {
- this.documentURL = documentURL;
- this.htmlData = htmlData;
- lowerData = htmlData.toLowerCase();
- associatedFiles = null;
- documentLinks = null;
- documentImages = null;
- textData = null;
- parsed = false;
-
-
- // Create the regex patterns that we will use for extracting URIs from tags.
- hrefPattern = Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+" +
- "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL);
- srcPattern = Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+" +
- "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL);
-
- URL url = new URL(documentURL);
- String urlPath = url.getPath();
- if ((urlPath == null) || (urlPath.length() == 0))
- {
- baseURL = documentURL;
- protocolHostPort = documentURL;
- }
- else if (urlPath.equals("/"))
- {
- baseURL = documentURL;
- protocolHostPort = documentURL.substring(0, documentURL.length()-1);
- }
- else if (urlPath.endsWith("/"))
- {
- baseURL = documentURL;
-
- int port = url.getPort();
- if (port > 0)
- {
- protocolHostPort = url.getProtocol() + "://" + url.getHost() + ":" +
- port;
- }
- else
- {
- protocolHostPort = url.getProtocol() + "://" + url.getHost();
- }
- }
- else
- {
- int port = url.getPort();
- if (port > 0)
- {
- protocolHostPort = url.getProtocol() + "://" + url.getHost() + ":" +
- port;
- }
- else
- {
- protocolHostPort = url.getProtocol() + "://" + url.getHost();
- }
-
- File urlFile = new File(urlPath);
- String parentDirectory = urlFile.getParent();
- if ((parentDirectory == null) || (parentDirectory.length() == 0))
- {
- parentDirectory = "/";
- }
- else if (! parentDirectory.startsWith("/"))
- {
- parentDirectory = "/" + parentDirectory;
- }
-
- baseURL = protocolHostPort + parentDirectory;
- }
-
- if (! baseURL.endsWith("/"))
- {
- baseURL = baseURL + "/";
+public class HTMLDocument {
+ // Indicates whether this HTML document has been parsed.
+ boolean parsed;
+
+ // A list of URLs of files that should be retrieved along with the main
+ // contents of the document. This may include any images contained in the
+ // document, and possibly any external stylesheets.
+ LinkedHashSet<String> associatedFiles;
+
+ // A list of URLs of frames that are contained in the document.
+ LinkedHashSet<String> documentFrames;
+
+ // A list of URLs of links that are contained in the document.
+ LinkedHashSet<String> documentLinks;
+
+ // A list of URLs of images that are contained in the document.
+ LinkedHashSet<String> documentImages;
+
+ // A regular expression pattern that can be used to extract a URI from an
+ // HREF
+ // tag.
+ Pattern hrefPattern;
+
+ // A regular expression pattern that can be used to extract a URI from a SRC
+ // tag.
+ Pattern srcPattern;
+
+ // The base URL for relative links in this document.
+ String baseURL;
+
+ // The URL that may be used to access this document.
+ String documentURL;
+
+ // The actual contents of the page.
+ String htmlData;
+
+ // The contents of the page converted to lowercase for easier matching.
+ String lowerData;
+
+ // The URL for this document with only protocol, host, and port (i.e., no
+ // file).
+ String protocolHostPort;
+
+ // A string buffer containing the contents of the page with tags removed.
+ StringBuffer textData;
+
+ // A set of private variables used for internal processing.
+ private boolean lastElementIsAssociatedFile;
+ private boolean lastElementIsChunk;
+ private boolean lastElementIsComment;
+ private boolean lastElementIsFrame;
+ private boolean lastElementIsImage;
+ private boolean lastElementIsLink;
+ private boolean lastElementIsText;
+ private int lastElementEndPos;
+ private int lastElementStartPos;
+ private String lastURL;
+
+ // constructor that helps to parse without url stuff
+ public HTMLDocument(String htmlData) {
+ this.documentURL = null;
+ this.htmlData = htmlData;
+ lowerData = htmlData.toLowerCase();
+ associatedFiles = null;
+ documentLinks = null;
+ documentImages = null;
+ textData = null;
+ parsed = false;
+
+ // Create the regex patterns that we will use for extracting URIs from
+ // tags.
+ hrefPattern = Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+"
+ + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL);
+ srcPattern = Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+"
+ + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL);
}
- }
-
-
- /**
- * Actually parses the HTML document and extracts useful elements from it.
- *
- * @return <CODE>true</CODE> if the page could be parsed successfully, or
- * <CODE>false</CODE> if not.
- */
- public boolean parse()
- {
- if (parsed)
- {
- return true;
- }
+ /**
+ * Creates a new HTML document using the provided data.
+ *
+ * @param documentURL The URL for this document.
+ * @param htmlData The actual data contained in the HTML document.
+ */
+ public HTMLDocument(String documentURL, String htmlData)
+ throws MalformedURLException {
+ this.documentURL = documentURL;
+ this.htmlData = htmlData;
+ lowerData = htmlData.toLowerCase();
+ associatedFiles = null;
+ documentLinks = null;
+ documentImages = null;
+ textData = null;
+ parsed = false;
+
+ // Create the regex patterns that we will use for extracting URIs from
+ // tags.
+ hrefPattern = Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+"
+ + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL);
+ srcPattern = Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+"
+ + "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL);
+
+ URL url = new URL(documentURL);
+ String urlPath = url.getPath();
+ if ((urlPath == null) || (urlPath.length() == 0)) {
+ baseURL = documentURL;
+ protocolHostPort = documentURL;
+ } else if (urlPath.equals("/")) {
+ baseURL = documentURL;
+ protocolHostPort = documentURL.substring(0,
+ documentURL.length() - 1);
+ } else if (urlPath.endsWith("/")) {
+ baseURL = documentURL;
+
+ int port = url.getPort();
+ if (port > 0) {
+ protocolHostPort = url.getProtocol() + "://" + url.getHost()
+ + ":" + port;
+ } else {
+ protocolHostPort = url.getProtocol() + "://" + url.getHost();
+ }
+ } else {
+ int port = url.getPort();
+ if (port > 0) {
+ protocolHostPort = url.getProtocol() + "://" + url.getHost()
+ + ":" + port;
+ } else {
+ protocolHostPort = url.getProtocol() + "://" + url.getHost();
+ }
+ File urlFile = new File(urlPath);
+ String parentDirectory = urlFile.getParent();
+ if ((parentDirectory == null) || (parentDirectory.length() == 0)) {
+ parentDirectory = "/";
+ } else if (!parentDirectory.startsWith("/")) {
+ parentDirectory = "/" + parentDirectory;
+ }
- try
- {
- associatedFiles = new LinkedHashSet<String>();
- documentFrames = new LinkedHashSet<String>();
- documentLinks = new LinkedHashSet<String>();
- documentImages = new LinkedHashSet<String>();
- textData = new StringBuffer();
-
- lastElementStartPos = 0;
- lastElementEndPos = -1;
- String element;
- while ((element = nextDocumentElement()) != null)
- {
- if (element.length() == 0)
- {
- continue;
+ baseURL = protocolHostPort + parentDirectory;
}
- if (lastElementIsText)
- {
- char lastChar;
- if (textData.length() == 0)
- {
- lastChar = ' ';
- }
- else
- {
- lastChar = textData.charAt(textData.length()-1);
- }
- char firstChar = element.charAt(0);
- if (! ((lastChar == ' ') || (lastChar == '\t') ||
- (lastChar == '\r') || (lastChar == '\n')) ||
- (firstChar == ' ') || (firstChar == '\t') ||
- (firstChar == '\r') || (firstChar == '\n'))
- {
- textData.append(" ");
- }
-
- textData.append(element);
- }
- else if (lastElementIsImage)
- {
- if (lastURL != null)
- {
- documentImages.add(lastURL);
- associatedFiles.add(lastURL);
- }
- }
- else if (lastElementIsFrame)
- {
- if (lastURL != null)
- {
- documentFrames.add(lastURL);
- associatedFiles.add(lastURL);
- }
- }
- else if (lastElementIsLink)
- {
- if (lastURL != null)
- {
- documentLinks.add(lastURL);
- }
- }
- else if (lastElementIsAssociatedFile)
- {
- if (lastURL != null)
- {
- associatedFiles.add(lastURL);
- }
- }
- else if (lastElementIsChunk || lastElementIsComment)
- {
- // Don't need to do anything with this.
+ if (!baseURL.endsWith("/")) {
+ baseURL = baseURL + "/";
}
- else
- {
- // Also don't need anything here.
- }
- }
-
- parsed = true;
- }
- catch (Exception e)
- {
- associatedFiles = null;
- documentLinks = null;
- documentImages = null;
- textData = null;
- parsed = false;
}
- return parsed;
- }
+ /**
+ * Actually parses the HTML document and extracts useful elements from it.
+ *
+ * @return <CODE>true</CODE> if the page could be parsed successfully, or
+ * <CODE>false</CODE> if not.
+ */
+ public boolean parse() {
+ if (parsed) {
+ return true;
+ }
+ try {
+ associatedFiles = new LinkedHashSet<String>();
+ documentFrames = new LinkedHashSet<String>();
+ documentLinks = new LinkedHashSet<String>();
+ documentImages = new LinkedHashSet<String>();
+ textData = new StringBuffer();
+
+ lastElementStartPos = 0;
+ lastElementEndPos = -1;
+ String element;
+ while ((element = nextDocumentElement()) != null) {
+ if (element.length() == 0) {
+ continue;
+ }
+
+ if (lastElementIsText) {
+ char lastChar;
+ if (textData.length() == 0) {
+ lastChar = ' ';
+ } else {
+ lastChar = textData.charAt(textData.length() - 1);
+ }
+ char firstChar = element.charAt(0);
+ if (!((lastChar == ' ') || (lastChar == '\t')
+ || (lastChar == '\r') || (lastChar == '\n'))
+ || (firstChar == ' ')
+ || (firstChar == '\t')
+ || (firstChar == '\r') || (firstChar == '\n')) {
+ textData.append(" ");
+ }
+
+ textData.append(element);
+ } else if (lastElementIsImage) {
+ if (lastURL != null) {
+ documentImages.add(lastURL);
+ associatedFiles.add(lastURL);
+ }
+ } else if (lastElementIsFrame) {
+ if (lastURL != null) {
+ documentFrames.add(lastURL);
+ associatedFiles.add(lastURL);
+ }
+ } else if (lastElementIsLink) {
+ if (lastURL != null) {
+ documentLinks.add(lastURL);
+ }
+ } else if (lastElementIsAssociatedFile) {
+ if (lastURL != null) {
+ associatedFiles.add(lastURL);
+ }
+ } else if (lastElementIsChunk || lastElementIsComment) {
+ // Don't need to do anything with this.
+ } else {
+ // Also don't need anything here.
+ }
+ }
+ parsed = true;
+ } catch (Exception e) {
+ associatedFiles = null;
+ documentLinks = null;
+ documentImages = null;
+ textData = null;
+ parsed = false;
+ }
- /**
- * Retrieves the next element from the HTML document. An HTML element can
- * include a string of plain text, a single HTML tag, or a larger chunk of
- * HTML including a start and end tag, all of which should be considered a
- * single element.
- */
- private String nextDocumentElement()
- {
- // If we're at the end of the HTML, then return null.
- if (lastElementEndPos >= htmlData.length())
- {
- return null;
+ return parsed;
}
+ /**
+ * Retrieves the next element from the HTML document. An HTML element can
+ * include a string of plain text, a single HTML tag, or a larger chunk of
+ * HTML including a start and end tag, all of which should be considered a
+ * single element.
+ */
+ private String nextDocumentElement() {
+ // If we're at the end of the HTML, then return null.
+ if (lastElementEndPos >= htmlData.length()) {
+ return null;
+ }
- // Initialize the variables we will use for the search.
- lastElementStartPos = lastElementEndPos+1;
- lastElementIsAssociatedFile = false;
- lastElementIsChunk = false;
- lastElementIsComment = false;
- lastElementIsFrame = false;
- lastElementIsImage = false;
- lastElementIsLink = false;
- lastElementIsText = false;
- lastURL = null;
-
-
- // Find the location of the next open angle bracket. If there is none, then
- // the rest of the document must be plain text.
- int openPos = lowerData.indexOf('<', lastElementStartPos);
- if (openPos < 0)
- {
- lastElementEndPos = htmlData.length();
- lastElementIsText = true;
- return htmlData.substring(lastElementStartPos);
- }
+ // Initialize the variables we will use for the search.
+ lastElementStartPos = lastElementEndPos + 1;
+ lastElementIsAssociatedFile = false;
+ lastElementIsChunk = false;
+ lastElementIsComment = false;
+ lastElementIsFrame = false;
+ lastElementIsImage = false;
+ lastElementIsLink = false;
+ lastElementIsText = false;
+ lastURL = null;
+
+ // Find the location of the next open angle bracket. If there is none,
+ // then
+ // the rest of the document must be plain text.
+ int openPos = lowerData.indexOf('<', lastElementStartPos);
+ if (openPos < 0) {
+ lastElementEndPos = htmlData.length();
+ lastElementIsText = true;
+ return htmlData.substring(lastElementStartPos);
+ }
+ // If the location of the next open tag is not we started looking, then
+ // read
+ // everything up to that tag as text.
+ if (openPos > lastElementStartPos) {
+ lastElementEndPos = openPos - 1;
+ lastElementIsText = true;
+ return htmlData.substring(lastElementStartPos, openPos);
+ }
- // If the location of the next open tag is not we started looking, then read
- // everything up to that tag as text.
- if (openPos > lastElementStartPos)
- {
- lastElementEndPos = openPos-1;
- lastElementIsText = true;
- return htmlData.substring(lastElementStartPos, openPos);
- }
+ // The start position is an open tag. See if the tag is actually "<!--",
+ // which indicates an HTML comment. If that's the case, then find the
+ // closing "-->".
+ if (openPos == lowerData.indexOf("<!--", lastElementStartPos)) {
+ int closePos = lowerData.indexOf("-->", openPos + 1);
+ if (closePos < 0) {
+ // This looks like an unterminated comment. We can't do much
+ // else
+ // here, so just stop parsing.
+ return null;
+ } else {
+ lastElementEndPos = closePos + 2;
+ lastElementIsComment = true;
+ return htmlData.substring(lastElementStartPos,
+ lastElementEndPos + 1);
+ }
+ }
+ // Find the location of the next close angle bracket. If there is none,
+ // then we have an unmatched open tag. What to do here? I guess just
+ // treat
+ // the rest of the document as text.
+ int closePos = lowerData.indexOf('>', openPos + 1);
+ if (closePos < 0) {
+ lastElementEndPos = htmlData.length();
+ lastElementIsText = true;
+ return htmlData.substring(lastElementStartPos);
+ }
- // The start position is an open tag. See if the tag is actually "<!--",
- // which indicates an HTML comment. If that's the case, then find the
- // closing "-->".
- if (openPos == lowerData.indexOf("<!--", lastElementStartPos))
- {
- int closePos = lowerData.indexOf("-->", openPos+1);
- if (closePos < 0)
- {
- // This looks like an unterminated comment. We can't do much else
- // here, so just stop parsing.
- return null;
- }
- else
- {
- lastElementEndPos = closePos + 2;
- lastElementIsComment = true;
- return htmlData.substring(lastElementStartPos, lastElementEndPos+1);
- }
- }
+ // Grab the contents of the tag in both normal and lowercase.
+ String tag = htmlData.substring(openPos, closePos + 1);
+ String strippedTag = htmlData.substring(openPos + 1, closePos).trim();
+ StringTokenizer tokenizer = new StringTokenizer(strippedTag,
+ " \t\r\n=\"'");
+ lastElementEndPos = closePos;
+ if (!tokenizer.hasMoreTokens()) {
+ return tag;
+ }
- // Find the location of the next close angle bracket. If there is none,
- // then we have an unmatched open tag. What to do here? I guess just treat
- // the rest of the document as text.
- int closePos = lowerData.indexOf('>', openPos+1);
- if (closePos < 0)
- {
- lastElementEndPos = htmlData.length();
- lastElementIsText = true;
- return htmlData.substring(lastElementStartPos);
- }
+ String token = tokenizer.nextToken();
+ String lowerToken = token.toLowerCase();
+
+ if (lowerToken.equals("a") || lowerToken.equals("area")) {
+ while (tokenizer.hasMoreTokens()) {
+ token = tokenizer.nextToken();
+ if (token.equalsIgnoreCase("href")) {
+ try {
+ Matcher matcher = hrefPattern.matcher(tag);
+ lastURL = uriToURL(matcher.replaceAll("$1"));
+ if (lastURL != null) {
+ lastElementIsLink = true;
+ }
+ } catch (Exception e) {
+ }
+ break;
+ }
+ }
+ } else if (lowerToken.equals("base")) {
+ while (tokenizer.hasMoreTokens()) {
+ token = tokenizer.nextToken();
+ if (token.equalsIgnoreCase("href")) {
+ try {
+ Matcher matcher = hrefPattern.matcher(tag);
+ String uri = matcher.replaceAll("$1");
+ if (!uri.endsWith("/")) {
+ uri = uri + "/";
+ }
+
+ baseURL = uri;
+ } catch (Exception e) {
+ }
+ break;
+ }
+ }
+ } else if (lowerToken.equals("frame") || lowerToken.equals("iframe")
+ || lowerToken.equals("input")) {
+ while (tokenizer.hasMoreTokens()) {
+ token = tokenizer.nextToken();
+ if (token.equalsIgnoreCase("src")) {
+ try {
+ Matcher matcher = srcPattern.matcher(tag);
+ String uri = matcher.replaceAll("$1");
+ lastURL = uriToURL(uri);
+ if (lastURL != null) {
+ lastElementIsFrame = true;
+ lastElementIsAssociatedFile = true;
+ }
+ } catch (Exception e) {
+ }
+ break;
+ }
+ }
+ } else if (lowerToken.equals("img")) {
+ while (tokenizer.hasMoreTokens()) {
+ token = tokenizer.nextToken();
+ if (token.equalsIgnoreCase("src")) {
+ try {
+ Matcher matcher = srcPattern.matcher(tag);
+ String uri = matcher.replaceAll("$1");
+ lastURL = uriToURL(uri);
+ if (lastURL != null) {
+ lastElementIsImage = true;
+ }
+ } catch (Exception e) {
+ }
+ break;
+ }
+ }
+ } else if (lowerToken.equals("link")) {
+ boolean isStyleSheet = false;
+
+ while (tokenizer.hasMoreTokens()) {
+ token = tokenizer.nextToken();
+ if (token.equalsIgnoreCase("href")) {
+ try {
+ Matcher matcher = hrefPattern.matcher(tag);
+ String uri = matcher.replaceAll("$1");
+ lastURL = uriToURL(uri);
+ if (lastURL != null) {
+ lastElementIsLink = true;
+ }
+ } catch (Exception e) {
+ }
+ break;
+ } else if (token.equalsIgnoreCase("rel")) {
+ if (tokenizer.hasMoreTokens()) {
+ String relType = tokenizer.nextToken();
+ if (relType.equalsIgnoreCase("stylesheet")) {
+ isStyleSheet = true;
+ }
+ }
+ }
+ }
+ if (lastURL != null) {
+ if (isStyleSheet) {
+ lastElementIsAssociatedFile = true;
+ } else {
+ lastElementIsLink = true;
+ }
+ }
+ } else if (lowerToken.equals("script")) {
+ while (tokenizer.hasMoreTokens()) {
+ token = tokenizer.nextToken();
+ if (token.equalsIgnoreCase("src")) {
+ try {
+ Matcher matcher = srcPattern.matcher(tag);
+ String uri = matcher.replaceAll("$1");
+ lastURL = uriToURL(uri);
+ } catch (Exception e) {
+ }
+ break;
+ }
+ }
- // Grab the contents of the tag in both normal and lowercase.
- String tag = htmlData.substring(openPos, closePos+1);
- String strippedTag = htmlData.substring(openPos+1, closePos).trim();
- StringTokenizer tokenizer = new StringTokenizer(strippedTag, " \t\r\n=\"'");
- lastElementEndPos = closePos;
+ if (lastURL == null) {
+ int endScriptPos = lowerData.indexOf("</script>",
+ lastElementEndPos + 1);
+ if (endScriptPos > 0) {
+ lastElementEndPos = endScriptPos + 8;
+ tag = htmlData.substring(lastElementStartPos,
+ lastElementEndPos + 1);
+ lastElementIsChunk = true;
+ }
+ } else {
+ lastElementIsAssociatedFile = true;
+ }
+ }
- if (! tokenizer.hasMoreTokens())
- {
- return tag;
+ return tag;
}
- String token = tokenizer.nextToken();
- String lowerToken = token.toLowerCase();
-
- if (lowerToken.equals("a") || lowerToken.equals("area"))
- {
- while (tokenizer.hasMoreTokens())
- {
- token = tokenizer.nextToken();
- if (token.equalsIgnoreCase("href"))
- {
- try
- {
- Matcher matcher = hrefPattern.matcher(tag);
- lastURL = uriToURL(matcher.replaceAll("$1"));
- if (lastURL != null)
- {
- lastElementIsLink = true;
+ /**
+ * Converts the provided URI to a URL. The provided URI may be a URL
+ * already, or it may also be an absolute path on the server or a path
+ * relative to the base URL.
+ *
+ * @param uri The URI to convert to a URL.
+ *
+ * @return The URL based on the provided URI.
+ */
+ private String uriToURL(String uri) {
+ String url = null;
+
+ if (uri.indexOf("://") > 0) {
+ if (uri.startsWith("http")) {
+ url = uri;
}
- } catch (Exception e) {}
- break;
+ } else if (uri.startsWith("/")) {
+ url = protocolHostPort + uri;
+ } else {
+ url = baseURL + uri;
}
- }
+
+ return url;
}
- else if (lowerToken.equals("base"))
- {
- while (tokenizer.hasMoreTokens())
- {
- token = tokenizer.nextToken();
- if (token.equalsIgnoreCase("href"))
- {
- try
- {
- Matcher matcher = hrefPattern.matcher(tag);
- String uri = matcher.replaceAll("$1");
- if (! uri.endsWith("/"))
- {
- uri = uri + "/";
- }
- baseURL = uri;
- } catch (Exception e) {}
- break;
- }
- }
+ /**
+ * Retrieves the URL of this HTML document.
+ *
+ * @return The URL of this HTML document.
+ */
+ public String getDocumentURL() {
+ return documentURL;
}
- else if (lowerToken.equals("frame") || lowerToken.equals("iframe") ||
- lowerToken.equals("input"))
- {
- while (tokenizer.hasMoreTokens())
- {
- token = tokenizer.nextToken();
- if (token.equalsIgnoreCase("src"))
- {
- try
- {
- Matcher matcher = srcPattern.matcher(tag);
- String uri = matcher.replaceAll("$1");
- lastURL = uriToURL(uri);
- if (lastURL != null)
- {
- lastElementIsFrame = true;
- lastElementIsAssociatedFile = true;
- }
- } catch (Exception e) {}
- break;
- }
- }
+
+ /**
+ * Retrieves the original HTML data used to create this document.
+ *
+ * @return The orginal HTML data used to create this document.
+ */
+ public String getHTMLData() {
+ return htmlData;
}
- else if (lowerToken.equals("img"))
- {
- while (tokenizer.hasMoreTokens())
- {
- token = tokenizer.nextToken();
- if (token.equalsIgnoreCase("src"))
- {
- try
- {
- Matcher matcher = srcPattern.matcher(tag);
- String uri = matcher.replaceAll("$1");
- lastURL = uriToURL(uri);
- if (lastURL != null)
- {
- lastElementIsImage = true;
+
+ /**
+ * Retrieves the contents of the HTML document with all tags removed.
+ *
+ * @return The contents of the HTML document with all tags removed, or
+ * <CODE>null</CODE> if a problem occurs while trying to parse the
+ * HTML.
+ */
+ public String getTextData() {
+ if (!parsed) {
+ if (!parse()) {
+ return null;
}
- } catch (Exception e) {}
- break;
}
- }
+
+ return textData.toString();
}
- else if (lowerToken.equals("link"))
- {
- boolean isStyleSheet = false;
-
- while (tokenizer.hasMoreTokens())
- {
- token = tokenizer.nextToken();
- if (token.equalsIgnoreCase("href"))
- {
- try
- {
- Matcher matcher = hrefPattern.matcher(tag);
- String uri = matcher.replaceAll("$1");
- lastURL = uriToURL(uri);
- if (lastURL != null)
- {
- lastElementIsLink = true;
- }
- } catch (Exception e) {}
- break;
- }
- else if (token.equalsIgnoreCase("rel"))
- {
- if (tokenizer.hasMoreTokens())
- {
- String relType = tokenizer.nextToken();
- if (relType.equalsIgnoreCase("stylesheet"))
- {
- isStyleSheet = true;
+
+ /**
+ * Retrieves an array containing a set of URLs parsed from the HTML document
+ * that reference files that would normally be downloaded as part of
+ * retrieving a page in a browser. This includes images and external style
+ * sheets.
+ *
+ * @return An array containing a set of URLs to files associated with the
+ * HTML document, or <CODE>null</CODE> if a problem occurs while
+ * trying to parse the HTML.
+ */
+ public String[] getAssociatedFiles() {
+ if (!parsed) {
+ if (!parse()) {
+ return null;
}
- }
}
- }
- if (lastURL != null)
- {
- if (isStyleSheet)
- {
- lastElementIsAssociatedFile = true;
- }
- else
- {
- lastElementIsLink = true;
- }
- }
- }
- else if (lowerToken.equals("script"))
- {
- while (tokenizer.hasMoreTokens())
- {
- token = tokenizer.nextToken();
- if (token.equalsIgnoreCase("src"))
- {
- try
- {
- Matcher matcher = srcPattern.matcher(tag);
- String uri = matcher.replaceAll("$1");
- lastURL = uriToURL(uri);
- } catch (Exception e) {}
- break;
- }
- }
-
- if (lastURL == null)
- {
- int endScriptPos = lowerData.indexOf("</script>", lastElementEndPos+1);
- if (endScriptPos > 0)
- {
- lastElementEndPos = endScriptPos + 8;
- tag = htmlData.substring(lastElementStartPos, lastElementEndPos+1);
- lastElementIsChunk = true;
- }
- }
- else
- {
- lastElementIsAssociatedFile = true;
- }
+ String[] urlArray = new String[associatedFiles.size()];
+ associatedFiles.toArray(urlArray);
+ return urlArray;
}
- return tag;
- }
-
-
-
- /**
- * Converts the provided URI to a URL. The provided URI may be a URL already,
- * or it may also be an absolute path on the server or a path relative to the
- * base URL.
- *
- * @param uri The URI to convert to a URL.
- *
- * @return The URL based on the provided URI.
- */
- private String uriToURL(String uri)
- {
- String url = null;
-
- if (uri.indexOf("://") > 0)
- {
- if (uri.startsWith("http"))
- {
- url = uri;
- }
- }
- else if (uri.startsWith("/"))
- {
- url = protocolHostPort + uri;
- }
- else
- {
- url = baseURL + uri;
- }
+ /**
+ * Retrieves an array containing a set of URLs parsed from the HTML document
+ * that are in the form of links to other content.
+ *
+ * @return An array containing a set of URLs parsed from the HTML document
+ * that are in the form of links to other content, or
+ * <CODE>null</CODE> if a problem occurs while trying to parse the
+ * HTML.
+ */
+ public String[] getDocumentLinks() {
+ if (!parsed) {
+ if (!parse()) {
+ return null;
+ }
+ }
- return url;
- }
-
-
-
- /**
- * Retrieves the URL of this HTML document.
- *
- * @return The URL of this HTML document.
- */
- public String getDocumentURL()
- {
- return documentURL;
- }
-
-
-
- /**
- * Retrieves the original HTML data used to create this document.
- *
- * @return The orginal HTML data used to create this document.
- */
- public String getHTMLData()
- {
- return htmlData;
- }
-
-
-
- /**
- * Retrieves the contents of the HTML document with all tags removed.
- *
- * @return The contents of the HTML document with all tags removed, or
- * <CODE>null</CODE> if a problem occurs while trying to parse the
- * HTML.
- */
- public String getTextData()
- {
- if (! parsed)
- {
- if (! parse())
- {
- return null;
- }
+ String[] urlArray = new String[documentLinks.size()];
+ documentLinks.toArray(urlArray);
+ return urlArray;
}
- return textData.toString();
- }
-
-
-
- /**
- * Retrieves an array containing a set of URLs parsed from the HTML document
- * that reference files that would normally be downloaded as part of
- * retrieving a page in a browser. This includes images and external style
- * sheets.
- *
- * @return An array containing a set of URLs to files associated with the
- * HTML document, or <CODE>null</CODE> if a problem occurs while
- * trying to parse the HTML.
- */
- public String[] getAssociatedFiles()
- {
- if (! parsed)
- {
- if (! parse())
- {
- return null;
- }
- }
+ /**
+ * Retrieves an array containing a set of URLs parsed from the HTML document
+ * that reference images used in the document.
+ *
+ * @return An array containing a set of URLs parsed from the HTML document
+ * that reference images used in the document.
+ */
+ public String[] getDocumentImages() {
+ if (!parsed) {
+ if (!parse()) {
+ return null;
+ }
+ }
- String[] urlArray = new String[associatedFiles.size()];
- associatedFiles.toArray(urlArray);
- return urlArray;
- }
-
-
-
- /**
- * Retrieves an array containing a set of URLs parsed from the HTML document
- * that are in the form of links to other content.
- *
- * @return An array containing a set of URLs parsed from the HTML document
- * that are in the form of links to other content, or
- * <CODE>null</CODE> if a problem occurs while trying to parse the
- * HTML.
- */
- public String[] getDocumentLinks()
- {
- if (! parsed)
- {
- if (! parse())
- {
- return null;
- }
+ String[] urlArray = new String[documentImages.size()];
+ documentImages.toArray(urlArray);
+ return urlArray;
}
- String[] urlArray = new String[documentLinks.size()];
- documentLinks.toArray(urlArray);
- return urlArray;
- }
-
-
-
- /**
- * Retrieves an array containing a set of URLs parsed from the HTML document
- * that reference images used in the document.
- *
- * @return An array containing a set of URLs parsed from the HTML document
- * that reference images used in the document.
- */
- public String[] getDocumentImages()
- {
- if (! parsed)
- {
- if (! parse())
- {
- return null;
- }
- }
+ /**
+ * Retrieves an array containing a set of URLs parsed from the HTML document
+ * that reference frames used in the document.
+ *
+ * @return An array containing a set of URLs parsed from the HTML document
+ * that reference frames used in the document.
+ */
+ public String[] getDocumentFrames() {
+ if (!parsed) {
+ if (!parse()) {
+ return null;
+ }
+ }
- String[] urlArray = new String[documentImages.size()];
- documentImages.toArray(urlArray);
- return urlArray;
- }
-
-
-
- /**
- * Retrieves an array containing a set of URLs parsed from the HTML document
- * that reference frames used in the document.
- *
- * @return An array containing a set of URLs parsed from the HTML document
- * that reference frames used in the document.
- */
- public String[] getDocumentFrames()
- {
- if (! parsed)
- {
- if (! parse())
- {
- return null;
- }
+ String[] urlArray = new String[documentFrames.size()];
+ documentFrames.toArray(urlArray);
+ return urlArray;
}
-
- String[] urlArray = new String[documentFrames.size()];
- documentFrames.toArray(urlArray);
- return urlArray;
- }
}
-