nekohtml的2个小例子
//获取网页里面的keywords和descriptionpublic static void main(String[] argv) throws Exception {BufferedReader in = new BufferedReader(new FileReader("d:/163.html"));DOMParser parser = new DOMParser(); parser.setProperty( "http://cyberneko.org/html/properties/default-encoding", "gb2312");parser.parse(new InputSource(in));Document doc = parser.getDocument();NodeList list = doc.getElementsByTagName("META");for(int i = 0, n = list.getLength(); i < n ; i++){Element e = (Element) list.item(i);if(e.getAttribute("name").equalsIgnoreCase("keywords")){String keywords = e.getAttribute("content");System.out.println("keywords: " + keywords);}if(e.getAttribute("name").equalsIgnoreCase("description")){String description = e.getAttribute("content");System.out.println(description);}}} ==========================================================================//2、test使用DOMFragmentParser,提取所有正文,由于没有过滤一些没用的标签,所以会有没用的信息打印,这个可以再事先过滤一下。public static void main(String[] argv) throws Exception {DOMFragmentParser parser = new DOMFragmentParser();HTMLDocument document = new HTMLDocumentImpl();DocumentFragment fragment = document.createDocumentFragment();parser.parse("http://sports.sina.com.cn/f1/2009-09-21/20104599271.shtml", fragment);print(fragment, "");} /** Prints a node's class name. */public static void print(Node node, String indent) {//System.out.println(indent + node.getClass().getName());//System.out.println(node.getNodeType());if (node.getNodeType() == Node.TEXT_NODE) {System.out.println(indent + node.getNodeValue());}Node child = node.getFirstChild();while (child != null) {print(child, indent + " ");child = child.getNextSibling();}} }
页:
[1]