// --- BEGIN COPYRIGHT BLOCK ---
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; version 2 of the License.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
//
// (C) 2007 Red Hat, Inc.
// All rights reserved.
// --- END COPYRIGHT BLOCK ---
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
/**
 * A lightweight, tolerant scanner for an HTML page.  It splits the page into
 * text fragments, comments, individual tags, and inline script blocks, and
 * collects the URLs referenced by anchors, images, frames, stylesheets, and
 * external scripts, as well as the page text with the markup removed.
 *
 * Parsing is lazy: it happens on the first call to parse() or to any
 * of the getters that need parsed data.  Instances keep mutable scanning state
 * and perform no synchronization.
 */
public class HTMLDocument
{
  // Shared, pre-compiled pattern that captures the value following an HREF
  // attribute name inside a tag.
  private static final Pattern HREF_PATTERN =
       Pattern.compile(".*?[hH][rR][eE][fF][\\s=\\\"\\']+" +
                       "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL);

  // Shared, pre-compiled pattern that captures the value following a SRC
  // attribute name inside a tag.
  private static final Pattern SRC_PATTERN =
       Pattern.compile(".*?[sS][rR][cC][\\s=\\\"\\']+" +
                       "([^\\s\\\"\\'\\>]+).*", Pattern.DOTALL);

  // Markers for the multi-tag elements that are consumed as a single unit.
  private static final String COMMENT_OPEN  = "<!--";
  private static final String COMMENT_CLOSE = "-->";
  private static final String SCRIPT_CLOSE  = "</script>";

  // Indicates whether this HTML document has been parsed.
  boolean parsed;

  // A list of URLs of files that should be retrieved along with the main
  // contents of the document.  This may include any images contained in the
  // document, and possibly any external stylesheets.
  LinkedHashSet associatedFiles;

  // A list of URLs of frames that are contained in the document.
  LinkedHashSet documentFrames;

  // A list of URLs of links that are contained in the document.
  LinkedHashSet documentLinks;

  // A list of URLs of images that are contained in the document.
  LinkedHashSet documentImages;

  // A regular expression pattern that can be used to extract a URI from an
  // HREF attribute.
  Pattern hrefPattern;

  // A regular expression pattern that can be used to extract a URI from a SRC
  // attribute.
  Pattern srcPattern;

  // The base URL for relative links in this document.  It is null when the
  // document was created without a URL and no BASE tag has been seen.
  String baseURL;

  // The URL that may be used to access this document.
  String documentURL;

  // The actual contents of the page.
  String htmlData;

  // The contents of the page converted to lowercase for easier matching.
  String lowerData;

  // The URL for this document with only protocol, host, and port (i.e., no
  // file).  It is null when the document was created without a URL.
  String protocolHostPort;

  // A string buffer containing the contents of the page with tags removed.
  StringBuffer textData;

  // Scanner state describing the element most recently returned by
  // nextDocumentElement().
  private boolean lastElementIsAssociatedFile;
  private boolean lastElementIsChunk;
  private boolean lastElementIsComment;
  private boolean lastElementIsFrame;
  private boolean lastElementIsImage;
  private boolean lastElementIsLink;
  private boolean lastElementIsText;
  private int     lastElementEndPos;
  private int     lastElementStartPos;
  private String  lastURL;



  /**
   * Creates a new HTML document that has no URL of its own.  Because there is
   * nothing to resolve against, relative and server-absolute URIs found in the
   * page are reported exactly as written.
   *
   * @param  htmlData  The actual data contained in the HTML document.
   */
  public HTMLDocument(String htmlData)
  {
    this.documentURL = null;
    this.htmlData    = htmlData;

    // Lowercase with a fixed locale so that tag matching does not depend on
    // the platform default (e.g. the Turkish dotless i).
    lowerData = htmlData.toLowerCase(Locale.ENGLISH);

    associatedFiles = null;
    documentFrames  = null;
    documentLinks   = null;
    documentImages  = null;
    textData        = null;
    parsed          = false;

    hrefPattern = HREF_PATTERN;
    srcPattern  = SRC_PATTERN;
  }



  /**
   * Creates a new HTML document using the provided data.
   *
   * @param  documentURL  The URL for this document.
   * @param  htmlData     The actual data contained in the HTML document.
   *
   * @throws  MalformedURLException  If the provided document URL cannot be
   *                                 parsed.
   */
  public HTMLDocument(String documentURL, String htmlData)
         throws MalformedURLException
  {
    this(htmlData);
    this.documentURL = documentURL;

    URL url = new URL(documentURL);

    // Build the origin from the URL components rather than from the raw
    // string so that a query string or fragment can never leak into it.
    String origin = url.getProtocol() + "://" + url.getHost();
    int port = url.getPort();
    if (port > 0)
    {
      origin = origin + ":" + port;
    }
    protocolHostPort = origin;

    // The base is the "directory" part of the path: everything up to and
    // including the last slash.  Plain string handling is used (instead of
    // java.io.File) because URL paths always use forward slashes regardless of
    // the platform.
    String urlPath = url.getPath();
    String directory;
    if ((urlPath == null) || (urlPath.length() == 0))
    {
      directory = "/";
    }
    else
    {
      directory = urlPath.substring(0, urlPath.lastIndexOf('/') + 1);
    }

    if (! directory.startsWith("/"))
    {
      directory = "/" + directory;
    }

    baseURL = protocolHostPort + directory;
  }



  /**
   * Actually parses the HTML document and extracts useful elements from it.
   * Calling this method again after a successful parse is a no-op.
   *
   * @return  true if the page could be parsed successfully, or
   *          false if not.
   */
  public boolean parse()
  {
    if (parsed)
    {
      return true;
    }

    try
    {
      associatedFiles = new LinkedHashSet();
      documentFrames  = new LinkedHashSet();
      documentLinks   = new LinkedHashSet();
      documentImages  = new LinkedHashSet();
      textData        = new StringBuffer();

      lastElementStartPos = 0;
      lastElementEndPos   = -1;

      String element;
      while ((element = nextDocumentElement()) != null)
      {
        if (element.length() == 0)
        {
          continue;
        }

        if (lastElementIsText)
        {
          appendText(element);
        }
        else if (lastURL == null)
        {
          // Comments, script chunks, and tags without a usable URL carry
          // nothing that needs to be recorded.
        }
        else if (lastElementIsImage)
        {
          documentImages.add(lastURL);
          associatedFiles.add(lastURL);
        }
        else if (lastElementIsFrame)
        {
          documentFrames.add(lastURL);
          associatedFiles.add(lastURL);
        }
        else if (lastElementIsLink)
        {
          documentLinks.add(lastURL);
        }
        else if (lastElementIsAssociatedFile)
        {
          associatedFiles.add(lastURL);
        }
      }

      parsed = true;
    }
    catch (RuntimeException e)
    {
      // Leave the document in its "never parsed" state so that the getters
      // report the failure by returning null.
      associatedFiles = null;
      documentFrames  = null;
      documentLinks   = null;
      documentImages  = null;
      textData        = null;
      parsed          = false;
    }

    return parsed;
  }



  /**
   * Appends a text fragment to the accumulated page text.  A single space is
   * inserted first when neither the end of the existing text nor the start of
   * the fragment is whitespace, so that words separated only by markup do not
   * run together.
   *
   * @param  text  The non-empty text fragment to append.
   */
  private void appendText(String text)
  {
    // An empty buffer is treated as ending in whitespace so that the text
    // never starts with an artificial separator.
    boolean endsWithSpace = (textData.length() == 0) ||
         isWhitespace(textData.charAt(textData.length()-1));
    boolean startsWithSpace = isWhitespace(text.charAt(0));

    if (! (endsWithSpace || startsWithSpace))
    {
      textData.append(" ");
    }

    textData.append(text);
  }



  /**
   * Indicates whether the provided character is a space, tab, carriage return,
   * or line feed.
   *
   * @param  c  The character to examine.
   *
   * @return  true if the character is one of those four, or false if not.
   */
  private static boolean isWhitespace(char c)
  {
    return (c == ' ') || (c == '\t') || (c == '\r') || (c == '\n');
  }



  /**
   * Retrieves the next element from the HTML document.  An HTML element can
   * include a string of plain text, a single HTML tag, or a larger chunk of
   * HTML including a start and end tag (a comment or an inline script), all of
   * which should be considered a single element.  The lastElement* fields and
   * lastURL describe the element that was returned.
   *
   * @return  The next element, or null if the end of the document
   *          has been reached or an unterminated comment was encountered.
   */
  private String nextDocumentElement()
  {
    // If we're at the end of the HTML, then return null.
    if (lastElementEndPos >= htmlData.length())
    {
      return null;
    }

    // Reset the description of the element that is about to be read.
    lastElementStartPos         = lastElementEndPos+1;
    lastElementIsAssociatedFile = false;
    lastElementIsChunk          = false;
    lastElementIsComment        = false;
    lastElementIsFrame          = false;
    lastElementIsImage          = false;
    lastElementIsLink           = false;
    lastElementIsText           = false;
    lastURL                     = null;

    // Find the next open angle bracket.  If there is none, then the rest of
    // the document must be plain text.
    int openPos = lowerData.indexOf('<', lastElementStartPos);
    if (openPos < 0)
    {
      return remainderAsText();
    }

    // If the next tag does not start where we started looking, then everything
    // up to that tag is text.
    if (openPos > lastElementStartPos)
    {
      lastElementEndPos = openPos-1;
      lastElementIsText = true;
      return htmlData.substring(lastElementStartPos, openPos);
    }

    // The start position is an open tag.  See if it actually opens a comment,
    // in which case everything up to the comment terminator is one element.
    if (lowerData.startsWith(COMMENT_OPEN, openPos))
    {
      int commentClosePos = lowerData.indexOf(COMMENT_CLOSE, openPos+1);
      if (commentClosePos < 0)
      {
        // This looks like an unterminated comment.  We can't do much else
        // here, so just stop parsing.
        return null;
      }

      // Point at the final '>' of the terminator.
      lastElementEndPos    = commentClosePos + COMMENT_CLOSE.length() - 1;
      lastElementIsComment = true;
      return htmlData.substring(lastElementStartPos, lastElementEndPos+1);
    }

    // Find the next close angle bracket.  If there is none, then we have an
    // unmatched open tag, and the rest of the document is treated as text.
    int closePos = lowerData.indexOf('>', openPos+1);
    if (closePos < 0)
    {
      return remainderAsText();
    }

    // Grab the tag itself and split its contents into name/attribute tokens.
    String tag         = htmlData.substring(openPos, closePos+1);
    String strippedTag = htmlData.substring(openPos+1, closePos).trim();
    StringTokenizer tokenizer =
         new StringTokenizer(strippedTag, " \t\r\n=\"'");
    lastElementEndPos = closePos;

    if (! tokenizer.hasMoreTokens())
    {
      return tag;
    }

    String tagName = tokenizer.nextToken().toLowerCase(Locale.ENGLISH);
    if (tagName.equals("a") || tagName.equals("area"))
    {
      if (seekAttribute(tokenizer, "href"))
      {
        lastURL           = uriToURL(extractURI(hrefPattern, tag));
        lastElementIsLink = (lastURL != null);
      }
    }
    else if (tagName.equals("base"))
    {
      if (seekAttribute(tokenizer, "href"))
      {
        String uri = extractURI(hrefPattern, tag);
        if (uri != null)
        {
          baseURL = uri.endsWith("/") ? uri : (uri + "/");
        }
      }
    }
    else if (tagName.equals("frame") || tagName.equals("iframe") ||
             tagName.equals("input"))
    {
      // Any of these tags with a SRC attribute is recorded as a frame.
      if (seekAttribute(tokenizer, "src"))
      {
        lastURL = uriToURL(extractURI(srcPattern, tag));
        if (lastURL != null)
        {
          lastElementIsFrame          = true;
          lastElementIsAssociatedFile = true;
        }
      }
    }
    else if (tagName.equals("img"))
    {
      if (seekAttribute(tokenizer, "src"))
      {
        lastURL            = uriToURL(extractURI(srcPattern, tag));
        lastElementIsImage = (lastURL != null);
      }
    }
    else if (tagName.equals("link"))
    {
      // Look at every attribute rather than stopping at HREF, because REL may
      // legitimately appear on either side of it.
      boolean hrefSeen     = false;
      boolean isStyleSheet = false;
      while (tokenizer.hasMoreTokens())
      {
        String token = tokenizer.nextToken();
        if (token.equalsIgnoreCase("href"))
        {
          hrefSeen = true;
        }
        else if (token.equalsIgnoreCase("rel") && tokenizer.hasMoreTokens())
        {
          if (tokenizer.nextToken().equalsIgnoreCase("stylesheet"))
          {
            isStyleSheet = true;
          }
        }
      }

      if (hrefSeen)
      {
        lastURL = uriToURL(extractURI(hrefPattern, tag));
      }

      // A stylesheet is something to download with the page; any other LINK
      // is an ordinary link.  Exactly one of the two flags is set.
      if (lastURL != null)
      {
        if (isStyleSheet)
        {
          lastElementIsAssociatedFile = true;
        }
        else
        {
          lastElementIsLink = true;
        }
      }
    }
    else if (tagName.equals("script"))
    {
      if (seekAttribute(tokenizer, "src"))
      {
        lastURL = uriToURL(extractURI(srcPattern, tag));
      }

      if (lastURL == null)
      {
        // An inline script: swallow everything through the closing tag so
        // that markup-like characters in the script are not parsed as HTML.
        int endScriptPos = lowerData.indexOf(SCRIPT_CLOSE, lastElementEndPos+1);
        if (endScriptPos >= 0)
        {
          lastElementEndPos  = endScriptPos + SCRIPT_CLOSE.length() - 1;
          tag = htmlData.substring(lastElementStartPos, lastElementEndPos+1);
          lastElementIsChunk = true;
        }
      }
      else
      {
        lastElementIsAssociatedFile = true;
      }
    }

    return tag;
  }



  /**
   * Consumes the rest of the document, starting at the current element start
   * position, as a single text element.
   *
   * @return  The remaining contents of the document.
   */
  private String remainderAsText()
  {
    lastElementEndPos = htmlData.length();
    lastElementIsText = true;
    return htmlData.substring(lastElementStartPos);
  }



  /**
   * Advances the provided tokenizer until a token equal to the given attribute
   * name (ignoring case) has been consumed.
   *
   * @param  tokenizer  The tokenizer positioned after the tag name.
   * @param  name       The attribute name to look for.
   *
   * @return  true if the attribute name was found, or false if the tokens ran
   *          out first.
   */
  private static boolean seekAttribute(StringTokenizer tokenizer, String name)
  {
    while (tokenizer.hasMoreTokens())
    {
      if (tokenizer.nextToken().equalsIgnoreCase(name))
      {
        return true;
      }
    }

    return false;
  }



  /**
   * Applies one of the attribute patterns to a tag and returns the captured
   * attribute value.
   *
   * @param  pattern  The HREF or SRC pattern to apply.
   * @param  tag      The complete tag, including the angle brackets.
   *
   * @return  The captured URI, or null if the tag does not
   *          contain a value for the attribute.
   */
  private static String extractURI(Pattern pattern, String tag)
  {
    Matcher matcher = pattern.matcher(tag);
    return matcher.matches() ? matcher.group(1) : null;
  }



  /**
   * Converts the provided URI to a URL.  The provided URI may be a URL already,
   * or it may also be an absolute path on the server or a path relative to the
   * base URL.  When the document has no server or base URL to resolve against,
   * the URI is returned unchanged.
   *
   * @param  uri  The URI to convert to a URL.  It may be null.
   *
   * @return  The URL based on the provided URI, or null if the
   *          URI is null or is a full URL whose scheme does not
   *          start with "http".
   */
  private String uriToURL(String uri)
  {
    if (uri == null)
    {
      return null;
    }

    if (uri.indexOf("://") > 0)
    {
      return uri.startsWith("http") ? uri : null;
    }

    String prefix = uri.startsWith("/") ? protocolHostPort : baseURL;
    return (prefix == null) ? uri : (prefix + uri);
  }



  /**
   * Retrieves the URL of this HTML document.
   *
   * @return  The URL of this HTML document, or null if it was
   *          created without one.
   */
  public String getDocumentURL()
  {
    return documentURL;
  }



  /**
   * Retrieves the original HTML data used to create this document.
   *
   * @return  The original HTML data used to create this document.
   */
  public String getHTMLData()
  {
    return htmlData;
  }



  /**
   * Retrieves the contents of the HTML document with all tags removed.
   *
   * @return  The contents of the HTML document with all tags removed, or
   *          null if a problem occurs while trying to parse the
   *          HTML.
   */
  public String getTextData()
  {
    if (! parse())
    {
      return null;
    }

    return textData.toString();
  }



  /**
   * Retrieves an array containing a set of URLs parsed from the HTML document
   * that reference files that would normally be downloaded as part of
   * retrieving a page in a browser.  This includes images, frames, external
   * scripts, and external style sheets.
   *
   * @return  An array containing a set of URLs to files associated with the
   *          HTML document, or null if a problem occurs while
   *          trying to parse the HTML.
   */
  public String[] getAssociatedFiles()
  {
    return parse() ? toStringArray(associatedFiles) : null;
  }



  /**
   * Retrieves an array containing a set of URLs parsed from the HTML document
   * that are in the form of links to other content.
   *
   * @return  An array containing a set of URLs parsed from the HTML document
   *          that are in the form of links to other content, or
   *          null if a problem occurs while trying to parse the
   *          HTML.
   */
  public String[] getDocumentLinks()
  {
    return parse() ? toStringArray(documentLinks) : null;
  }



  /**
   * Retrieves an array containing a set of URLs parsed from the HTML document
   * that reference images used in the document.
   *
   * @return  An array containing a set of URLs parsed from the HTML document
   *          that reference images used in the document, or
   *          null if a problem occurs while trying to parse the
   *          HTML.
   */
  public String[] getDocumentImages()
  {
    return parse() ? toStringArray(documentImages) : null;
  }



  /**
   * Retrieves an array containing a set of URLs parsed from the HTML document
   * that reference frames used in the document.
   *
   * @return  An array containing a set of URLs parsed from the HTML document
   *          that reference frames used in the document, or
   *          null if a problem occurs while trying to parse the
   *          HTML.
   */
  public String[] getDocumentFrames()
  {
    return parse() ? toStringArray(documentFrames) : null;
  }



  /**
   * Copies the URLs held in one of the result sets into a new array, keeping
   * the order in which they were found.
   *
   * @param  urls  The set of URL strings to copy.
   *
   * @return  A new array containing the elements of the set.
   */
  private static String[] toStringArray(LinkedHashSet urls)
  {
    String[] urlArray = new String[urls.size()];
    urls.toArray(urlArray);
    return urlArray;
  }
}