|
<div class="blog_content">在我整理完在线听歌(http://ting.faqee.com/)的模块后,剩下的工作就是通过程序抓取百度最新的热门歌曲。抓取的内容主要包括3个属性:歌名、歌曲在线播放地址和歌词内容(符合LRC歌词格式)。目前已完成歌名和歌曲地址的抓取;由于百度的歌曲地址很多是通过js获取的,歌曲地址这里改用搜狗音乐搜索来获取会方便些。所有的源码如下:
package com.common.utils;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.common.doc.FileOperUtils;

/**
 * Plain data holder for one scraped song: its display name, the URL it can
 * be played from, and its lyrics text (initialised to the empty string).
 */
class Song {
    private String name;
    private String url;
    private String lrc;

    /**
     * @param name song title
     * @param url  playback/download URL; may be empty when none was found
     */
    public Song(String name, String url) {
        this.name = name;
        this.url = url;
        this.lrc = "";
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getLrc() {
        return lrc;
    }

    public void setLrc(String lrc) {
        this.lrc = lrc;
    }
}

/**
 * Scrapes the Baidu MP3 top-songs page for song names and then looks up a
 * playable address for each name through a second music search site.
 *
 * <p>All pages are fetched and decoded as GBK. Failures are reported with
 * {@code printStackTrace()} and turned into empty results, never thrown.
 */
public class BaiduMp3 {

    /** Charset used for requests, responses, and URL encoding/decoding. */
    private static final String ENCODING = "GBK";

    /** Page listing the current top songs. */
    private static final String TOP_SONGS_URL =
            "http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2";

    /** Matches the song-list table on the top-songs page. */
    private static final String TOP_SONGS_TABLE_REGEX =
            "<table width=\"100%\" align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"list\">(.*?)</table>";

    /** Search URL prefix; the GBK-URL-encoded song name is appended. */
    private static final String SEARCH_URL_PREFIX = "http://so.mp3.qihoo.com/?type=0&src=s&kw=";

    /** Matches a result table on the search page. */
    private static final String SEARCH_TABLE_REGEX =
            "<table width=\"100%\" border=\"0\" cellspacing=\"0\" cellpadding=\"0\">(.*?)</table>";

    /** Matches the {@code u=...&} query parameter of a "down.html" link. */
    private static final String DOWNLOAD_PARAM_REGEX = "u=(.*?)\\&";

    /**
     * Opens {@code strUrl}, sends an empty request body, and returns only the
     * FIRST line of the response.
     *
     * <p>The response is decoded as GBK, matching the request writer and the
     * two-argument overload, instead of the platform default charset.
     *
     * @param strUrl address to visit
     * @return the first response line, or {@code ""} when the URL is
     *         malformed, the request fails, or the response is empty
     */
    public static String visitURL(String strUrl) {
        URL url;
        try {
            url = new URL(strUrl);
        } catch (MalformedURLException e) {
            e.printStackTrace();
            return ""; // previously fell through and dereferenced a null URL
        }

        URLConnection conn;
        try {
            conn = url.openConnection();
            conn.setDoOutput(true);
        } catch (IOException e) {
            System.out.println("e:" + e.getMessage());
            return ""; // previously fell through and dereferenced a null connection
        }

        // Open and immediately close the request body: nothing is written.
        OutputStreamWriter out = null;
        try {
            out = new OutputStreamWriter(conn.getOutputStream(), ENCODING);
            out.flush();
        } catch (IOException e) { // also covers UnsupportedEncodingException
            e.printStackTrace();
        } finally {
            closeResource(out);
        }

        // Read the response.
        BufferedReader rd = null;
        try {
            rd = new BufferedReader(new InputStreamReader(conn.getInputStream(), ENCODING));
            String firstLine = rd.readLine();
            return firstLine == null ? "" : firstLine;
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        } finally {
            closeResource(rd);
        }
    }

    /**
     * Fetches the complete response body of {@code strUrl} decoded as GBK.
     *
     * @param strUrl      address to visit
     * @param successFlag marker meant to identify a successful response (for
     *                    example a string containing "_SUCCESS"); it is
     *                    currently NOT checked and is kept only for interface
     *                    compatibility
     * @return the whole response body, or {@code ""} when the URL is
     *         malformed or the request fails
     */
    public static String visitURL(String strUrl, String successFlag) {
        HttpURLConnection jconn = null;
        InputStream in = null;
        ByteArrayOutputStream body = new ByteArrayOutputStream();
        try {
            URL url = new URL(strUrl);
            jconn = (HttpURLConnection) url.openConnection();
            jconn.setDoOutput(true);
            jconn.setDoInput(true);
            jconn.connect();

            in = jconn.getInputStream();
            byte[] buf = new byte[4096];
            int bytesRead;
            while ((bytesRead = in.read(buf)) != -1) {
                body.write(buf, 0, bytesRead);
            }
            return new String(body.toByteArray(), ENCODING);
        } catch (IOException e) { // also covers MalformedURLException
            e.printStackTrace();
        } finally {
            closeResource(in);
            // jconn is still null when the URL itself was malformed; the
            // unguarded disconnect() used to throw an NPE out of this finally.
            if (jconn != null) {
                jconn.disconnect();
            }
        }
        return "";
    }

    /** Closes {@code resource} if it is non-null, printing any close failure. */
    private static void closeResource(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Tells whether {@code astr} is null, exactly the literal "&amp;nbsp;",
     * or empty after trimming.
     */
    private static boolean isTrimEmptyOrBlank(String astr) {
        if (astr == null || "&nbsp;".equals(astr)) {
            return true;
        }
        return astr.trim().length() == 0;
    }

    /**
     * Returns the text of the {@code i}-th match (1-based) of {@code reg} in
     * {@code htmlContent}, with {@code .} also matching line breaks.
     *
     * <p>When {@code i} is less than 1, or there are fewer than {@code i}
     * matches, the LAST match is returned; callers pass 0 for that.
     *
     * @return the selected match, or {@code ""} when nothing matches
     */
    private static String getFilteredContent(String htmlContent, String reg, int i) {
        String content = "";
        int matchNo = 1;
        Matcher m = Pattern.compile(reg, Pattern.DOTALL).matcher(htmlContent);
        while (m.find()) {
            content = m.group();
            if (matchNo++ == i) {
                break;
            }
        }
        return content;
    }

    /**
     * Parses {@code html} and collects every link tag in it, in the order
     * the parser reports them.
     */
    private static List<LinkTag> extractLinks(String html) throws ParserException {
        List<LinkTag> links = new ArrayList<LinkTag>();
        Parser parser = Parser.createParser(html, ENCODING);
        NodeList nodeList = parser.parse(new NodeClassFilter(LinkTag.class));
        Node[] nodes = nodeList.toNodeArray();
        for (int i = 0; i < nodes.length; i++) {
            // The original cast the whole array: "(Node) nodes". The element
            // index was lost, which does not compile.
            Node node = nodes[i];
            if (node instanceof LinkTag) {
                links.add((LinkTag) node);
            }
        }
        return links;
    }

    /**
     * Downloads the top-songs page and builds one {@link Song} per song link
     * found in its list table, resolving each song's address by name.
     *
     * @return the songs found; empty when the page could not be fetched or
     *         parsed
     */
    public static List<Song> getBaiduSongs() {
        List<Song> songs = new ArrayList<Song>();
        String html = visitURL(TOP_SONGS_URL, "s");
        html = getFilteredContent(html, TOP_SONGS_TABLE_REGEX, 0);
        try {
            for (LinkTag link : extractLinks(html)) {
                // Skip links whose preceding sibling contains "(" -
                // presumably parenthesised artist links rather than song
                // titles; TODO confirm against the page markup.
                Node previous = link.getPreviousSibling();
                if (previous != null && previous.toString().indexOf("(") >= 0) {
                    continue;
                }
                String name = link.getLinkText();
                String href = link.getAttribute("href");
                if (isTrimEmptyOrBlank(name) || isTrimEmptyOrBlank(href)) {
                    continue;
                }
                songs.add(new Song(name, getSongURL(name)));
            }
        } catch (ParserException pe) {
            pe.printStackTrace();
        }
        return songs;
    }

    /**
     * Searches for {@code songname} and returns the address carried in the
     * {@code u=} parameter of the first usable "down.html" result link.
     *
     * @return the GBK-URL-decoded address, or {@code ""} when none is found
     *         or any step fails
     */
    private static String getSongURL(String songname) {
        try {
            String keyword = URLEncoder.encode(songname, ENCODING);
            String html = visitURL(SEARCH_URL_PREFIX + keyword, "s");
            html = getFilteredContent(html, SEARCH_TABLE_REGEX, 1);
            for (LinkTag link : extractLinks(html)) {
                String href = link.getAttribute("href");
                if (isTrimEmptyOrBlank(href) || !href.startsWith("down.html")) {
                    continue;
                }
                String param = getFilteredContent(href, DOWNLOAD_PARAM_REGEX, 0);
                if (param.length() > 5) {
                    // Utils is not defined in this file; presumably a
                    // same-package helper doing literal replacement - verify.
                    param = Utils.replace(param, "u=", "");
                    param = Utils.replace(param, "&", "");
                    return URLDecoder.decode(param, ENCODING);
                }
            }
        } catch (Exception pe) {
            pe.printStackTrace();
        }
        return "";
    }

    /** Prints every scraped song as {@code index:name->url}. */
    public static void main(String[] args) throws Exception {
        List<Song> songs = getBaiduSongs();
        int idx = 0;
        for (Song s : songs) {
            System.out.println((++idx) + ":" + s.getName() + "->" + s.getUrl());
        }
    }
}
|