nekohtml的使用
<div style="border: 1px solid rgb(204, 204, 204); padding: 4px 5px 4px 4px; background-color: rgb(238, 238, 238); font-size: 13px; width: 98%;"><!----> import org.cyberneko.html.parsers.DOMFragmentParser;import org.apache.html.dom.HTMLDocumentImpl;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Extracts the plain text from a piece of HTML markup.
 *
 * <p>The markup is handed to the NekoHTML fragment parser as a character stream, so the
 * text never passes through a byte encoding. Earlier revisions converted the string to
 * bytes with the platform default charset and then "repaired" the parser output by
 * re-encoding it as Windows-1252 and decoding it as GBK; that only worked when the
 * platform charset happened to be GBK and silently lost characters whose bytes have no
 * Windows-1252 mapping.
 *
 * <p>Parsing is best-effort: if the parser reports an error, the stack trace is printed
 * and whatever text had been collected up to that point is returned.
 *
 * @param content the HTML markup to parse; {@code null} is treated as empty input
 * @return the values of all text nodes found in the parsed fragment, concatenated in the
 *         order they are visited; never {@code null}
 * @throws UnsupportedEncodingException never thrown by this implementation; the clause is
 *         retained so that existing callers which catch it keep compiling
 */
public String extractTextFromHTML(String content)
        throws UnsupportedEncodingException {
    if (content == null) {
        return "";
    }
    DOMFragmentParser parser = new DOMFragmentParser();
    DocumentFragment fragment = new HTMLDocumentImpl().createDocumentFragment();
    // A character stream bypasses charset detection entirely. The class name is fully
    // qualified so this method does not depend on an extra import being present.
    InputSource source = new InputSource(new java.io.StringReader(content));
    try {
        parser.parse(source, fragment);
    } catch (IOException e) {
        // Deliberately non-fatal: fall through and return the text parsed so far.
        e.printStackTrace();
    } catch (SAXException se) {
        // Deliberately non-fatal: fall through and return the text parsed so far.
        se.printStackTrace();
    }
    // StringBuffer (not StringBuilder) because getText is declared with that type.
    StringBuffer text = new StringBuffer();
    this.getText(text, fragment);
    return text.toString();
}
/**
 * Appends the text found in {@code node} and all of its descendants to {@code sb}.
 *
 * <p>The node itself is examined first, then each child is processed recursively in
 * list order, so text is appended in the order the nodes are visited. Only nodes whose
 * type is {@link Node#TEXT_NODE} contribute; every other node type is traversed but
 * adds nothing itself.
 *
 * @param sb   buffer that receives the text
 * @param node root of the subtree to walk
 */
private void getText(StringBuffer sb, Node node) {
    final boolean isTextNode = Node.TEXT_NODE == node.getNodeType();
    if (isTextNode) {
        sb.append(node.getNodeValue());
    }

    final NodeList kids = node.getChildNodes();
    if (kids == null) {
        // Nothing below this node to descend into.
        return;
    }

    // The length is read once up front, exactly as a snapshot of the child count.
    final int count = kids.getLength();
    int index = 0;
    while (index < count) {
        getText(sb, kids.item(index));
        index++;
    }
}
页:
[1]