抓取网站畅销商品的示例:每个页面 20 个商品,共抓取 100 个。
1、解析器package com.yihaodian.pis.crawler;import java.util.ArrayList;import java.util.List;import java.util.Map;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.commons.lang.StringUtils;import org.apache.log4j.Logger;import org.htmlparser.Node;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;import org.htmlparser.filters.HasAttributeFilter;import org.htmlparser.tags.Bullet;import org.htmlparser.tags.BulletList;import org.htmlparser.tags.Div;import org.htmlparser.tags.ImageTag;import org.htmlparser.tags.LinkTag;import org.htmlparser.tags.ScriptTag;import org.htmlparser.tags.Span;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;import com.yihaodian.pis.dto.BestSellerDto;public class SuningPageParser extends PageParser{private static final Logger logger = Logger.getLogger(SuningPageParser.class);public SuningPageParser(String html, String charset) {super(html, charset);// TODO Auto-generated constructor stub}@Overridepublic List<BestSellerDto> extractBestSeller(String bestSellerHtml)throws ParserException {List<BestSellerDto> sellers = new ArrayList<BestSellerDto>(); Parser parser = Parser.createParser(bestSellerHtml, charset); NodeFilter filter = new HasAttributeFilter("class", "product_list02 profix02 clearfix"); NodeList nodeList = parser.extractAllNodesThatMatch(filter); if (nodeList == null || nodeList.size() == 0) { return null; } Node div = nodeList.elementAt(0); NodeList divchildren = div.getChildren(); BulletList ul = (BulletList)divchildren.elementAt(1); NodeList children = ul.getChildren(); BestSellerDto bestSeller = null; for (int i = 0; i < children.size(); i++) { bestSeller = new BestSellerDto(); Node child = children.elementAt(i); if (child instanceof Bullet) { Bullet li = (Bullet) child; Span nameDiv = (Span) findTagByClassName(li, "pro_intro"); //寻找tagName是 LinkTag 的那个 LinkTag link = (LinkTag)findTagByName(nameDiv,"LinkTag"); bestSeller.setName(link.getLinkText()); 
bestSeller.setUrl("http://www.suning.cn"+link.getLink()); Span pricespan = (Span) findTagByClassName(li, "pro_price"); String priceteString =pricespan.getChildrenHTML().replace("<em>", "").replace("</em>", "").replace("¥", ""); bestSeller.setPrice(priceteString); logger.info("畅销单品:" + bestSeller); sellers.add(bestSeller); } else { continue; } } return sellers;}@Overridepublic String extractNextPageUrlPattern(String bestSellerHtml)throws ParserException {String nextPageUrl="";Parser parser = Parser.createParser(bestSellerHtml, "utf-8");NodeFilter filter = new HasAttributeFilter("type", "text/javascript");; NodeList children = parser.extractAllNodesThatMatch(filter); if (children == null || children.size() == 0) { System.out.println("没有值"); }else{ System.out.println("有值"); } for (int i = 0; i < children.size(); i++) { ScriptTag child = (ScriptTag) children.elementAt(i);if(child.findPositionOf("&currentPage=")==0){String putInCart1 = null;String putInCart2 = null;//Pattern pattern2 = Pattern.compile("(?<=currentPage[)] \\{)([^\\}]*?)(?=\\})");Pattern pattern2 = Pattern.compile("(?<=var[ \\s]{0,100}(jumpUrl)[\\s]{0,100}[=][\\s]{0,100}[\"])(.*?)(?=\"\\s{0,100}[+])");Pattern pattern1 = Pattern.compile("(?<=var[ \\s]{0,100}dfy\\s{0,100}=\\s{0,100}[\"])(.*?)(?=[\"][\\s]{0,100})");Matcher matcher1 = pattern1.matcher(child.getChildrenHTML());if (matcher1.find()) {putInCart1 = matcher1.group(0).trim();}Matcher matcher2 = pattern2.matcher(child.getChildrenHTML());if (matcher2.find()) {putInCart2 = matcher2.group(0).trim();}//System.out.println(putInCart2.substring(15, putInCart2.indexOf(" + dfy")-1));nextPageUrl=putInCart2+putInCart1;} } // parser = Parser.createParser(bestSellerHtml, "utf-8"); //得到当前页currentPage// String currentPage="";// filter = new HasAttributeFilter("class", "on");// children = parser.extractAllNodesThatMatch(filter); //LinkTag dLinkTag = (LinkTag) children.elementAt(0); //System.out.println(dLinkTag.getLinkText());// for (int i = 0; i < children.size(); 
i++) {//Node node =children.elementAt(i);//if (node.getChildren().size()<2) {//LinkTag dLinkTag = (LinkTag)node;//if(dLinkTag.getLink().equals("#"))//currentPage= dLinkTag.getLinkText();//}//} nextPageUrl+="&ip_sortBy=salevolumn0&sortType=4&currentPage=";// + currentPage; logger.info("畅销榜下一页URL模式:" + nextPageUrl); return nextPageUrl;}@Overridepublic String getNextPageUrl(String nextPageUrlPattern, int pageNum) { StringBuilder sb = new StringBuilder(); sb.append("http://www.suning.cn/webapp/wcs/stores/servlet/"); sb.append(nextPageUrlPattern+(pageNum-1)); String nextPageUrl = sb.toString(); return nextPageUrl;}@Overridepublic String extractName(Map<String, String> params)throws ParserException {// TODO Auto-generated method stubreturn null;}@Overridepublic String extractPrice(Map<String, String> params)throws ParserException {// TODO Auto-generated method stubreturn null;}@Overridepublic String extractBrand(Map<String, String> params)throws ParserException {// TODO Auto-generated method stubreturn null;}@Overridepublic String extractImageUrl(Map<String, String> params)throws ParserException {// TODO Auto-generated method stubreturn null;}@Overridepublic boolean hasProduct(Map<String, String> params)throws ParserException {// TODO Auto-generated method stubreturn false;}} 2、主程序DAO
public List<BestSellerDto> fetchBestSeller(Integer id, int amount) { List<BestSellerDto> bestSellers = new ArrayList<BestSellerDto>(); if (amount <= 0) { logger.warn("畅销品数目不能为负数!"); return null; } SiteCategoryDto siteCategory = getSiteCategoryById(id); // 抓取参数 Map<String, String> params; // 组织抓取参数与页面参数 params = pageParamItemDao.getPageConfigBySite(siteCategory.getSiteId()); params.putAll(crawlerParamItemDao.getCrawlConfigBySite(siteCategory. getSiteId())); // 构建抓取对象 String charset = params.get(PageParamNames.CONTENT_ENCODING); int pageSize = Integer.parseInt(params.get(PageParamNames.BS_PAGE_SIZE)); int pages = amount / pageSize + 1; logger.info("畅销榜页数:" + pages); Crawler crawler = new Crawler(charset); String bestSellerHtml = null; PageParser pageParser = null; try { String categoryUrl = siteCategory.getCategoryUrl(); if (categoryUrl == null || categoryUrl.equals("")) { logger.info("此分类不支持畅销榜!"); return null; } bestSellerHtml = crawler.crawl(siteCategory.getCategoryUrl()); pageParser = PageParserFactory.createPageParser(null, charset, params); String nextPageUrlPattern=pageParser.extractNextPageUrlPattern(bestSellerHtml); if (siteCategory.getCategoryUrl().indexOf("suning")>0) { String nextPageUrl0 = pageParser.getNextPageUrl( nextPageUrlPattern, 1); bestSellerHtml = crawler.crawl(nextPageUrl0);} bestSellers = pageParser.extractBestSeller(bestSellerHtml); //if(amount<=bestSellers.size()) return bestSellers; if (nextPageUrlPattern != null) { for (int pageNum = 2; pageNum <= pages; pageNum++) { logger.info("抓取畅销榜第 " + pageNum + " 页"); String nextPageUrl = pageParser.getNextPageUrl( nextPageUrlPattern, pageNum); bestSellerHtml = crawler.crawl(nextPageUrl); List<BestSellerDto> moreBestSellers = pageParser. extractBestSeller(bestSellerHtml); if (moreBestSellers == null || moreBestSellers.isEmpty()) { break; } if (bestSellers.get(bestSellers.size() - 1).getUrl().equals( moreBestSellers.get(moreBestSellers.size() - 1). 
getUrl())) { break; } if (moreBestSellers != null && !moreBestSellers.isEmpty()) { bestSellers.addAll(moreBestSellers); } } } } catch (IOException e) { logger.error("抓取 " + siteCategory.getCategoryName() + " 畅销榜时出现异常!" + "URL为:" + siteCategory.getCategoryUrl(), e); } catch (ParserException e) { logger.error("解析畅销榜页面时出现异常!" + "URL为:" + siteCategory.getCategoryUrl(), e); } logger.info("++++++++++++++++++++++++++++++++++"); logger.info("畅销集合大小:" + (bestSellers == null ? 0 : bestSellers.size())); if (bestSellers != null) { if (bestSellers.size() <= amount) { return bestSellers; } List<BestSellerDto> subList = bestSellers.subList(0, amount); logger.info("----------------------------------"); logger.info("畅销集合大小:" + subList.size()); return subList; } else { return new ArrayList<BestSellerDto>(); } } 不懂的联系QQ526151410
页:
[1]