|
JavaXT
|
|
Parser Class
package javaxt.html;
import java.util.ArrayList;
import static javaxt.utils.Console.console;
//******************************************************************************
//** HTML Parser
//******************************************************************************
/**
* Used to parse HTML documents and fragments and find DOM elements
*
******************************************************************************/
public class Parser {
private String html;
//**************************************************************************
//** Constructor
//**************************************************************************
/** Creates a parser that searches the given HTML document or fragment.
 *  @param html Markup to search. The reference is stored as-is; no parsing
 *  happens until one of the lookup methods is called.
 */
public Parser(String html){
    this.html = html;
}
//**************************************************************************
//** getHTML
//**************************************************************************
/** Returns the HTML string this parser is currently scoped to.
 */
public String getHTML(){
    return this.html;
}
//**************************************************************************
//** setHTML
//**************************************************************************
/** Replaces the HTML string searched by this parser, i.e. resets the
 *  "scope" of the parser.
 *  @param html New markup to search.
 */
public void setHTML(String html){
    this.html = html;
}
//**************************************************************************
//** getElementByID
//**************************************************************************
/** Returns the first HTML Element whose "id" attribute equals the given
 *  value, regardless of tag name. Returns null if no such element is found.
 *  @param id Value of the "id" attribute to look for.
 */
public Element getElementByID(String id){
    final String attributeName = "id";
    return getElementByAttributes(null, attributeName, id);
}
//**************************************************************************
//** getElementsByTagName
//**************************************************************************
/** Returns an array of HTML Elements found in the HTML document with given
 *  tag name. Matches are collected one at a time: after each match, its
 *  outer HTML is cut out of the parser's working HTML so that the next
 *  search finds the following match. The parser's HTML is always restored
 *  before this method returns, even if the search throws.
 *  @param tagName Tag name to look for (e.g. "div").
 *  @return Matching elements in the order they were found. Never null.
 */
public Element[] getElementsByTagName(String tagName){
    ArrayList<Element> elements = new ArrayList<Element>();
    String orgHTML = html;
    try{
        Element e = getElementByTagName(tagName);
        while (e!=null){
            elements.add(e);

            //Remove the match from the working HTML. Stop if the match
            //cannot be removed; otherwise the same element would be found
            //again on every pass (endless loop) or substring() would throw.
            String outerHTML = e.getOuterHTML();
            if (outerHTML==null || outerHTML.length()==0) break;
            int idx = html.indexOf(outerHTML);
            if (idx<0) break;
            html = html.substring(0, idx) + html.substring(idx+outerHTML.length());

            e = getElementByTagName(tagName);
        }
    }
    finally{
        //Restore the original scope no matter how the loop ended
        html = orgHTML;
    }
    return elements.toArray(new Element[elements.size()]);
}
//**************************************************************************
//** getElements
//**************************************************************************
/** Returns the top-level HTML Elements found in the parser's current HTML.
 *  @return Array of top-level elements; empty if none were found.
 */
public Element[] getElements(){
    ArrayList<Element> topLevel = getElements(html);
    Element[] arr = new Element[topLevel.size()];
    return topLevel.toArray(arr);
}
//**************************************************************************
//** getElements
//**************************************************************************
/** Returns an array of HTML Elements found in the HTML document with given
 *  tag name, attribute, and attribute value (e.g. "div", "class", "hdr2").
 *  Matches are collected one at a time: after each match, its outer HTML is
 *  cut out of the parser's working HTML so that the next search finds the
 *  following match. The parser's HTML is always restored before this method
 *  returns, even if the search throws.
 *  @param tagName Tag name to look for, or null to accept any tag.
 *  @param attributeName Attribute name to look for, or null to skip the
 *  attribute test.
 *  @param attributeValue Required value of the attribute.
 *  @return Matching elements in the order they were found. Never null.
 */
public Element[] getElements(String tagName, String attributeName, String attributeValue){
    ArrayList<Element> elements = new ArrayList<Element>();
    String orgHTML = html;
    try{
        Element e = getElementByAttributes(tagName, attributeName, attributeValue);
        while (e!=null){
            elements.add(e);

            //Remove the match from the working HTML. Stop if the match
            //cannot be removed; otherwise the same element would be found
            //again on every pass (endless loop) or substring() would throw.
            String outerHTML = e.getOuterHTML();
            if (outerHTML==null || outerHTML.length()==0) break;
            int idx = html.indexOf(outerHTML);
            if (idx<0) break;
            html = html.substring(0, idx) + html.substring(idx+outerHTML.length());

            e = getElementByAttributes(tagName, attributeName, attributeValue);
        }
    }
    finally{
        //Restore the original scope no matter how the loop ended
        html = orgHTML;
    }
    return elements.toArray(new Element[elements.size()]);
}
//**************************************************************************
//** getElementByTagName
//**************************************************************************
/** Returns the first HTML Element in the document that has the given tag
 *  name, with no constraint on its attributes. Returns null if no element
 *  was found.
 *  @param tagName Tag name to look for (e.g. "div").
 */
public Element getElementByTagName(String tagName){
    final String anyAttribute = null;
    final String anyValue = null;
    return getElementByAttributes(tagName, anyAttribute, anyValue);
}
//**************************************************************************
//** getElementByAttributes
//**************************************************************************
/** Returns the first HTML Element found in the HTML document with given tag
 *  name and attribute. All top-level elements are tested first; only if none
 *  of them matches are their descendants searched. Top-level elements with
 *  no name, and "script" elements, are not descended into. Returns null if
 *  an element was not found.
 *  @param tagName Tag name to look for, or null to accept any tag.
 *  @param attributeName Attribute name to look for, or null to skip the
 *  attribute test.
 *  @param attributeValue Required value of the attribute.
 */
public Element getElementByAttributes(String tagName, String attributeName, String attributeValue){
    Element[] topLevel = getElements();

    //First pass: look for a match among the top-level elements themselves
    for (int i=0; i<topLevel.length; i++){
        Element candidate = topLevel[i];
        if (hasAttributes(tagName, attributeName, attributeValue, candidate)) return candidate;
    }

    //Second pass: search inside each top-level element
    for (int i=0; i<topLevel.length; i++){
        Element parent = topLevel[i];
        String name = parent.getName();
        if (name==null || name.equalsIgnoreCase("script")) continue;
        Element match = getElementByAttributes(tagName, attributeName, attributeValue, parent);
        if (match!=null) return match;
    }
    return null;
}
/** Searches the descendants of the given element for the first element that
 *  matches the given tag name and attribute. Direct children are tested
 *  first; if none matches, each child is searched recursively. Children
 *  with no name, and "script" children, are not descended into.
 *  @param tagName Tag name to look for, or null to accept any tag.
 *  @param attributeName Attribute name to look for, or null to skip the
 *  attribute test.
 *  @param attributeValue Required value of the attribute.
 *  @param element Element whose child nodes are searched.
 *  @return The first match, or null if none was found.
 */
private Element getElementByAttributes(String tagName, String attributeName, String attributeValue, Element element){
    Element[] elements = element.getChildNodes();
    for (Element e : elements){
        if (hasAttributes(tagName, attributeName, attributeValue, e)){
            return e;
        }
    }
    for (Element e : elements){
        //Test the child being descended into (not the parent), mirroring
        //the filter applied to top-level elements by the public overload
        String n = e.getName();
        if (n!=null){
            if (!n.equalsIgnoreCase("script")){
                Element el = getElementByAttributes(tagName, attributeName, attributeValue, e);
                if (el!=null) return el;
            }
        }
    }
    return null;
}
//**************************************************************************
//** hasAttributes
//**************************************************************************
/** Returns true if the given element matches the tagName, attributeName,
 *  and attributeValue. An element without a name never matches. A null
 *  tagName accepts any tag (tag names are compared ignoring case) and a
 *  null attributeName skips the attribute test. When the element reports
 *  no value for the attribute, it matches only if attributeValue is null;
 *  otherwise the values must be equal (case-sensitive).
 */
private boolean hasAttributes(String tagName, String attributeName, String attributeValue, Element element){
    String name = element.getName();
    if (name==null) return false;

    boolean tagMatches = (tagName==null) || name.equalsIgnoreCase(tagName);
    if (!tagMatches) return false;

    if (attributeName==null) return true;

    String val = element.getAttribute(attributeName);
    if (val==null) return attributeValue==null;
    return val.equals(attributeValue);
}
//**************************************************************************
//** getImageLinks
//**************************************************************************
/** Returns a list of links to images, taken from the "src" attribute of
 *  every "img" element. Images with a missing or empty "src" are skipped.
 *  The links may include relative paths. Use the MapPath method to resolve
 *  the relative paths to a fully qualified url.
 *  @return The "src" values in the order the images were found. Never null.
 */
public String[] getImageLinks(){
    ArrayList<String> links = new ArrayList<String>();
    for (Element img : getElementsByTagName("img")){
        String src = img.getAttribute("src");
        //Guard against a null attribute value as well as an empty one
        if (src!=null && src.length()>0) links.add(src);
    }
    return links.toArray(new String[links.size()]);
}
//**************************************************************************
//** stripHTMLTags
//**************************************************************************
/** Used to remove any html tags from a block of text. Everything from a
 *  "&lt;" up to and including the next "&gt;" is dropped in a single pass.
 *  A "&lt;" that is never closed is kept as ordinary text. Non-breaking
 *  spaces (the "&amp;nbsp;" entity and the U+00A0 character) are converted
 *  to regular spaces and the result is trimmed.
 *  @param html Text that may contain html tags.
 *  @return The text without tags, or null if the input was null.
 */
public static String stripHTMLTags(String html){
    if (html==null) return null;

    int len = html.length();
    StringBuilder text = new StringBuilder(len);
    int tagStart = -1; //index of the "<" of the tag being skipped, or -1
    for (int i=0; i<len; i++){
        char c = html.charAt(i);
        if (c=='<' && tagStart<0){
            tagStart = i;
        }
        if (tagStart>=0){
            //Inside a tag: skip characters until the closing ">"
            if (c=='>') tagStart = -1;
        }
        else{
            text.append(c);
        }
    }

    //An unterminated "<" is not a tag; keep the remaining text as-is
    if (tagStart>=0) text.append(html.substring(tagStart));

    return text.toString().replace("&nbsp;", " ").replace('\u00A0', ' ').trim();
}
//**************************************************************************
//** MapPath
//**************************************************************************
/** Returns a fully qualified URL for a given path. Returns null if the
 *  function fails to resolve the path (e.g. the path is null or empty, or
 *  the base url is null).
 *  @param relPath Relative path to a file (e.g. "../images/header.jpg"). If
 *  the path is already a fully qualified URL it is returned unchanged.
 *  @param url URL that is sourcing the relPath (e.g. "http://acme.com/about/")
 *  @return Using the examples cited in the 2 parameters, return a URL
 *  "http://acme.com/images/header.jpg". The port is included only when it
 *  differs from the default port of the protocol.
 */
public static String MapPath(String relPath, java.net.URL url){
    if (relPath==null || relPath.length()==0) return null;

    //Check if relPath is a fully qualified URL. If so, return the relPath.
    try{
        new java.net.URL(relPath);
        return relPath;
    }
    catch(java.net.MalformedURLException e){
        //Not an absolute URL. Resolve it against the base url below.
    }

    try{
        //Let java.net.URL resolve the path against the base url. This
        //handles "./", "../", root-relative paths, multi-segment paths,
        //and base urls that end in a file name.
        java.net.URL resolved = new java.net.URL(url, relPath);

        StringBuilder str = new StringBuilder();
        str.append(resolved.getProtocol());
        str.append("://");
        str.append(resolved.getHost());

        int port = resolved.getPort();
        if (port>0 && port!=resolved.getDefaultPort()){
            str.append(":");
            str.append(port);
        }

        str.append(resolved.getFile()); //path + query string

        String ref = resolved.getRef();
        if (ref!=null){
            str.append("#");
            str.append(ref);
        }
        return str.toString();
    }
    catch(java.net.MalformedURLException e){
        //Missing base url or unsupported protocol
        return null;
    }
}
//**************************************************************************
//** getAbsolutePath
//**************************************************************************
/** Returns a fully qualified URL for a given path. The url string is parsed
 *  into a java.net.URL and handed to MapPath(). Returns null if the url
 *  cannot be parsed or the path cannot be resolved. See MapPath() method
 *  for more information.
 *  @param relPath Relative path to a file.
 *  @param url URL that is sourcing the relPath.
 *  @deprecated Use MapPath()
 */
public static String getAbsolutePath(String relPath, String url){
    String absolutePath = null;
    try{
        java.net.URL base = new java.net.URL(url);
        absolutePath = MapPath(relPath, base);
    }
    catch(Exception e){
        //Any failure is reported to the caller as null
    }
    return absolutePath;
}
//**************************************************************************
//** getElements
//**************************************************************************
/** Returns top-level nodes in a given HTML string. The string is scanned
 *  character by character while tracking whether the scanner is inside an
 *  html comment or a double-quoted string. Whenever a tag is closed by an
 *  unquoted "&gt;", the end of the element is located (self-closing tag,
 *  matching end tag via findEndTag, or the tag itself when no end tag is
 *  found) and the whole span is turned into an Element. Scanning resumes
 *  after the element, so nested elements are not returned.
 *  @param s HTML to scan. May be null.
 *  @return List of top-level elements; empty if s is null or has no tags.
 */
private static ArrayList<Element> getElements(String s){
    ArrayList<Element> elements = new ArrayList<Element>();
    if (s==null) return elements;

    boolean insideComment = false;
    boolean insideQuote = false;
    int start = 0; //index where the current tag (or the next element) begins
    int len = s.length();
    for (int i=0; i<len; i++){
        char c = s.charAt(i);
        if (c=='<'){

            //Check whether this is the start of a comment
            if (!insideComment && !insideQuote){
                if (i+3<len-1){
                    String str = s.substring(i, i+4);
                    if (str.equals("<!--")){
                        insideComment = true;
                        i += 3;
                        continue;
                    }
                }
            }

            //Otherwise, remember where the tag starts
            if (!insideComment && !insideQuote){
                start = i;
            }
        }
        else if (c=='>'){

            //Check whether this is the end of a comment
            if (insideComment && !insideQuote){
                String str = s.substring(i-2, i+1);
                if (str.equals("-->")){
                    insideComment = false;
                    continue;
                }
            }

            if (!insideComment && !insideQuote){

                //A ">" at the very start of the string cannot close a tag
                //and has no preceding character to inspect. Treat as text.
                if (i==0) continue;

                //Find the (exclusive) end of the element
                int end = -1;
                char p = s.charAt(i-1);
                if (p=='/'){
                    //Self-closing tag
                    end = i+1;
                }
                else{
                    try{
                        String str = s.substring(start, i+1);
                        Element el = new Element(str);
                        end = findEndTag(el.getName(), i+1, s);
                        if (end==-1){
                            //No end tag. The element is just the tag itself.
                            end = i+1;
                        }
                        else{
                            //findEndTag returns the index of the closing ">"
                            end++;
                        }
                    }
                    catch(Exception e){} //shouldn't happen
                }

                if (end>-1){
                    String str = s.substring(start, end);
                    try{
                        Element el = new Element(str);
                        elements.add(el);
                    }
                    catch(Exception e){} //shouldn't happen

                    //NOTE(review): the loop increment moves past index "end",
                    //so the character directly after an element is never
                    //inspected (e.g. a quote, or the "<" of a comment that
                    //immediately follows). Left as-is; confirm if intended.
                    i = end;
                    start = i;
                }
                else{
                    //console.log("error finding end tag!");
                }
            }
        }
        else if (c=='"'){
            if (!insideComment){
                insideQuote = !insideQuote;
            }
        }
    }
    return elements;
}
//**************************************************************************
//** findEndTag
//**************************************************************************
/** Returns the position of an end tag corresponding to the given tagName.
 *  The scan starts with one open tag; every further opening tag with the
 *  same name increments a counter and every closing tag with the same name
 *  decrements it (self-closing tags are ignored). Text inside html comments
 *  and double quotes is skipped. Tag names are compared ignoring case.
 *  @param tagName Name of the tag whose end tag is wanted.
 *  @param x Index in s at which to start scanning (just past the opening
 *  tag).
 *  @param s HTML being scanned.
 *  @return Index of the "&gt;" that closes the matching end tag, or -1 if
 *  no matching end tag was found.
 */
private static int findEndTag(String tagName, int x, String s){
    int numTags = 1;
    int start = -1;
    boolean insideComment = false;
    boolean insideQuote = false;
    boolean insideClosingTag = false;

    //Every tag seen during the scan: {name, isClosingTag, start, end}
    ArrayList<Object[]> tags = new ArrayList<Object[]>();

    int len = s.length();
    for (int i=x; i<len; i++){
        char c = s.charAt(i);
        if (c=='<'){

            //Check whether this is the start of a comment
            if (!insideComment && !insideQuote){
                if (i+3<len-1){
                    String str = s.substring(i, i+4);
                    if (str.equals("<!--")){
                        insideComment = true;
                        i += 3;
                        continue;
                    }
                }
            }

            //Otherwise, remember where the tag starts and whether it is
            //a closing tag ("</")
            if (!insideComment && !insideQuote){
                if (i+1<len-1){
                    char n = s.charAt(i+1);
                    if (n=='/'){
                        insideClosingTag = true;
                    }
                }
                start = i;
            }
        }
        else if (c=='>'){

            //Check whether this is the end of a comment
            if (insideComment && !insideQuote){
                String str = s.substring(i-2, i+1);
                if (str.equals("-->")){
                    insideComment = false;
                    continue;
                }
            }

            if (!insideComment && !insideQuote){

                //Get tag name
                String currTagName = "";
                try{
                    String str;
                    if (insideClosingTag){
                        //Drop the "/" so the tag can be parsed like an opening tag
                        str = "<" + s.substring(start+2, i+1);
                    }
                    else{
                        str = s.substring(start, i+1);
                    }
                    Element el = new Element(str);
                    currTagName = el.getName();
                    tags.add(new Object[]{currTagName, insideClosingTag, start, i});
                }
                catch(Exception e){} //shouldn't happen

                //Compare current tag to the target tag. If there's a match,
                //update the numTags
                if (isSameTag(currTagName, tagName)){
                    char p = s.charAt(i-1);
                    if (p!='/'){ //self enclosing tags don't update the numTags
                        if (insideClosingTag){
                            numTags--;
                        }
                        else{
                            numTags++;
                        }
                    }
                }

                //If numTags is 0, we have found the end!
                if (numTags==0) return i;

                //Reset the closing-tag flag for the next tag
                insideClosingTag = false;
            }
        }
        else if (c=='"'){
            if (!insideComment){
                insideQuote = !insideQuote;
            }
        }
    }

    //Special case for tags like this: <div><div id="1"></div>
    //In Chrome and Firefox, this translates to: <div><div id="1"></div></div>
    //In this case, we want to return the position of the end of: </div>
    if (!tags.isEmpty()){
        Object[] nextTag = tags.get(0);
        String nextTagName = (String) nextTag[0];
        boolean isClosingTag = (Boolean) nextTag[1];
        if (isSameTag(nextTagName, tagName) && !isClosingTag){

            //Find the last tag in the run of consecutive same-name tags
            Object[] lastTag = null;
            for (int i=1; i<tags.size(); i++){
                Object[] tag = tags.get(i);
                String name = (String) tag[0];
                if (isSameTag(name, tagName)){
                    lastTag = tag;
                }
                else {
                    break;
                }
            }

            if (lastTag!=null){
                isClosingTag = (Boolean) lastTag[1];
                if (isClosingTag){
                    return (Integer) lastTag[3];
                }
            }
        }
    }
    return -1;
}

/** Returns true if both tag names are non-null and equal, ignoring case.
 */
private static boolean isSameTag(String a, String b){
    return a!=null && a.equalsIgnoreCase(b);
}
//**************************************************************************
//** findGT
//**************************************************************************
/** Returns the position of the first "&gt;" character, at or after the
 *  given index, that is neither part of an html comment nor inside a
 *  double-quoted string.
 *  @param x Index in s at which to start scanning.
 *  @param s Text to scan.
 *  @return Index of the "&gt;" character, or -1 if none was found.
 */
protected static int findGT(int x, String s){
    final int len = s.length();
    boolean inComment = false;
    boolean inQuote = false;
    int pos = x;
    while (pos<len){
        switch (s.charAt(pos)){
            case '<':
                //Start of a comment? If so, jump over the "<!--" marker
                if (!inComment && !inQuote && pos+3<len-1 && s.startsWith("<!--", pos)){
                    inComment = true;
                    pos += 3;
                }
                break;
            case '>':
                if (inComment && !inQuote && s.startsWith("-->", pos-2)){
                    //End of the comment
                    inComment = false;
                }
                else if (!inComment && !inQuote){
                    return pos;
                }
                break;
            case '"':
                //Quotes are only tracked outside of comments
                if (!inComment) inQuote = !inQuote;
                break;
            default:
                break;
        }
        pos++;
    }
    return -1;
}
}
|
|