江西广告网

标题: HTMLPage类基础说明 [打印本页]

作者: 幽忧    时间: 2009-2-28 11:12
标题: HTMLPage类基础说明
HTMLPage类: HTMLPage类中主要也就几种用途,而从HTMLPage类中抓取图片是一个非常重要的一个功能,诚然还有超链接和表单。而在HTMLPage类的内置类Parser中,大部分工作都是由handleSimpleTag(简单标签)和和handleStartTag(起始标签方法来完成。 详细代码清单如下: package com.heaton.bot; import java.util.*; import com.heaton.bot.*; import java.net.*; import java.io.*; import javax.swing.text.*; import javax.swing.text.html.*; /** * The HTMLPage class is used to parse an HTML page and store * that page, in a parsed form, in memory. * are exchanged with a webserver. */ public class HTMLPage { /** * A list of images on this page. */ protected Vector images = new Vector(); /** * A list of links on this page. */ protected Vector links = new Vector(); /** * A list of forms on this page. */ protected Vector forms = new Vector(); /** * The underlying HTTP object for this page. */ protected HTTP http; /** * The base URL to resolve relative URL's. */ protected String base; /** * Construct an HTMLPage object. * * @param http The HTTP object(or subclass) to use to * download pages. */ public HTMLPage(HTTP http) { this.http = http; } /** * Called to open a page and read it in. If null * is specified for the callback(回调), then the other * methods in this class may be used to look at * images, links and forms. * open是进入HTMLPage类的入口点。 * @param url The URL to read. * @param callback A callback class to handle the parse, or null * to use the built in one. * @exception java.io.IOException * @exception javax.swing.text.BadLocationException */ public void open(String url, HTMLEditorKit.ParserCallback callback) throws IOException,BadLocationException { http.send(url,null); base = url; processPage(callback); } /** * Internal function called to start the parse. * * @param callback The callback object to use. * @exception java.io.IOException */ protected void processPage(HTMLEditorKit.ParserCallback callback) throws IOException { /* * 创建一个字符串阅读器。 */ StringReader r = new StringReader(http.getBody()); /* * 创建新的解析器。 */ HTMLEditorKit.Parser parse = new HTMLParse().getParser(); /* * 程序检查是否提供了定制的回调类。如果提供了回调类,则 * 使用该会掉泪,从而结束了HTMLPage类的工作。如果没有提 * 供回调类,HTMLPage则使用内置的回调类,该回调类为Parser。 */ [1] [2] [3] if ( callback==null ) { HTMLPage.Parser p=new HTMLPage.Parser(); parse.parse(r,p,true); } else parse.parse(r,callback,false); } /** * Get the underlying HTTP object that was * sent to the constructor. * * @return The underlying HTTP object. */ public HTTP getHTTP() { return http; } /** * Get a list of all of the links from this page. * If this is to be used then null must have been * passed as the callback object to the open method. * * @return All links on this page. */ public Vector getLinks() { return links; } /** * Get a list of all of the images from this page. * If this is to be used then null must have been * passed as the callback object to the open method. * * @return A list of all of the images on this page. */ public Vector getImages() { return images; } /** * Get a list of all of the forms from this page. * If this is to be used then null must have been * passed as the callback object to the open method. * * @return A list of forms. */ public Vector getForms() { return forms; } /** * Called to perform a post for the specified form. * * @param form The form object to post. * @exception java.io.IOException */ public void post(HTMLForm form) throws IOException { http.getClientHeaders().set("Content-Type", "application/x-www-form-urlencoded"); http.send(form.getAction(),form.toString()); processPage(null); } /** * Get the URL that is represented by this page. * * @return The URL that is represented by this page. */ public String getURL() { return http.getURL(); } /** * Called internally to add an image to the list. * * @param img The image to add. */ protected void addImage(String img) { img = URLUtility.resolveBase(base,img); for ( int i=0;i<images.size();i ) { String s = (String)images.elementAt(i); if ( s.equalsIgnoreCase(img) ) return; } images.addElement(img); } /** * A HTML parser callback used by this class to * detect links, images and forms. * 定义一个名为Parser的内部类,该类实现一个专门的解析器回 * 调函数,用来跟踪超链接,图像和表单。 */ protected class Parser extends HTMLEditorKit.ParserCallback { /** * Used to build up data for an HTML form. */ protected HTMLForm tempForm; /** * Used to build up options for an HTML form. */ protected AttributeList tempOptions; /** * Used to build up options for an HTML form. */ protected Attribute tempElement = new Attribute(); /** * Holds the prompt text(just before or after a control. */ protected String tempPrompt = ""; /** * Holds the link till the end link is found */ protected Link tempLink; /** * Called to handle comments. * * @param data The comment. * @param pos The position. */ public void handleComment(char[] data,int pos) { } /** * Called to handle an ending tag. * * @param t The ending tag. * @param pos The position. */ public void handleEndTag(HTML.Tag t,int pos) { if ( t==HTML.Tag.OPTION ) { if ( tempElement!=null ) { tempElement.setName(tempPrompt); tempOptions.add(tempElement); tempPrompt = ""; } tempElement = null; } else if ( t==HTML.Tag.FORM ) { if ( tempForm!=null ) forms.addElement(tempForm); tempPrompt = ""; } else if ( t==HTML.Tag.A ) { if ( tempLink!=null ) tempLink.setPrompt(tempPrompt); tempPrompt = ""; } } /** * Called to handle an error. Not used. * * @param errorMsg The error. * @param pos The position. */ public void handleError(String errorMsg,int pos) { } 上一页 [1] [2] [3] /** * Called to handle a simple tag. * * @param t The simple tag. * @param a The attribute list. * @param pos The position. */ public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a,int pos) { handleStartTag(t,a,pos); } /** * Called to handle a starting tag. * * @param t The starting tag. * @param a The attribute list. * @param pos The position. */ public void handleStartTag(HTML.Tag t, MutableAttributeSet a,int pos) { String type = ""; // is it some sort of a link String href = (String)a.getAttribute(HTML.Attribute.HREF); if ( (href!=null) && (t!=HTML.Tag.BASE) ) { String alt = (String)a.getAttribute(HTML.Attribute.ALT); Link link = new Link( alt, URLUtility.resolveBase(base,href), null); links.addElement(tempLink=link); } else if ( t==HTML.Tag.OPTION ) { tempElement = new Attribute(); tempElement.setName(""); tempElement.setValue((String)a.getAttribute(HTML.Attribute.VALUE)); } else if ( t==HTML.Tag.SELECT ) { if ( tempForm==null ) return; tempOptions = new AttributeList(); tempForm.addInput( (String)a.getAttribute(HTML.Attribute.NAME), null, "select", tempPrompt, tempOptions); tempPrompt = ""; } else if ( t==HTML.Tag.TEXTAREA ) { if ( tempForm==null ) return; tempForm.addInput( (String)a.getAttribute(HTML.Attribute.NAME), null, "textarea", tempPrompt, null); tempPrompt = ""; } else if ( t==HTML.Tag.FORM ) { if ( tempForm!=null ) forms.addElement(tempForm); String action = (String)a.getAttribute(HTML.Attribute.ACTION); if ( action!=null ) { try { URL aurl = new URL(new URL(http.getURL()),action); action = aurl.toString(); } catch ( MalformedURLException e ) { action = null; } } tempForm = new HTMLForm( (String)a.getAttribute(HTML.Attribute.METHOD), action ); tempPrompt = ""; } else if ( t==HTML.Tag.INPUT ) { if ( tempForm==null ) return; if ( t!=HTML.Tag.INPUT ) { type = (String)a.getAttribute(HTML.Attribute.TYPE); if ( type==null ) return; } else type = "select"; if ( type.equalsIgnoreCase("text") || type.equalsIgnoreCase("edit") || type.equalsIgnoreCase("password") || type.equalsIgnoreCase("select") || type.equalsIgnoreCase("hidden") ) { tempForm.addInput( (String)a.getAttribute(HTML.Attribute.NAME), (String)a.getAttribute(HTML.Attribute.VALUE), type, tempPrompt, null); tempOptions = new AttributeList(); } } else if ( t==HTML.Tag.BASE ) { href = (String)a.getAttribute(HTML.Attribute.HREF); if ( href!=null ) base = href; } else if ( t==HTML.Tag.IMG ) { String src = (String)a.getAttribute(HTML.Attribute.SRC); if ( src!=null ) addImage(src); } } /** * Called to handle text. * * @param data The text. * @param pos The position. */ public void handleText(char[] data,int pos) { tempPrompt = new String(data) " "; } } } 上一页 [1] [2] [3]




欢迎光临 江西广告网 (http://bbs.jxadw.com/) Powered by Discuz! X3.2