JAVA 爬虫问题

2016-11-02 09:30:26 +08:00
 Jobin0528
使用 HttpClient 模拟登录之后,将登录成功的 SESSIONID 传给 WebMagic 爬虫,爬虫能够爬取到需登录网站的下载链接,但使用链接下载的时候得到的都是“请重新登陆!”。
问题可能出在登录成功的 sessionid 失效了?
4386 次点击
所在节点    Java
10 条回复
waytoexplorer
2016-11-02 09:44:28 +08:00
这个 sessionid 可能会定时变更,爬取的时候也要相应更改
CharlesL
2016-11-02 10:23:37 +08:00
把模拟登录之后的 cookie 也给 webmagic 带上试试
Jobin0528
2016-11-02 10:51:53 +08:00
@waytoexplorer 请问这个怎么操作?
Jobin0528
2016-11-02 10:52:03 +08:00
@CharlesL 带上了
joechan
2016-11-02 12:20:42 +08:00
应该是你带上的 sessionid 还是每次都不同的
winglight2016
2016-11-02 15:31:50 +08:00
你用浏览器访问一下,看看登录请求及返回的所有 header 和 response ,肯定是漏了什么地方没模拟对
Jobin0528
2016-11-02 15:36:54 +08:00
```
// NOTE(review): pasted fragment — the method below is missing its closing brace
// and a return statement (the signature declares String), so it does not
// compile as posted.
@RequestMapping(value = "doGrab",method = RequestMethod.POST)
public String doGrab(String username, String password, HttpServletRequest request){
try {
// Simulated login via HttpClient; returns the session cookie string
// (presumably of the form "PHPSESSID=<value>" — confirm against getCookie()).
String cookie = simulationHttpUtil.getCookie(username,password);
// Split "name=value" — cookies[1] is taken as the session-id value.
// NOTE(review): brittle if the string ever contains more than one '='.
String cookies[] = cookie.split("=");
webMagicUtil.setSite(cookies[1]);
webMagicUtil.setCook(cookie);
Spider.create(webMagicUtil)
// Start crawling from this page
.addUrl("http://www.digifilm.com.cn/index.php/member/index")
.addPipeline(new ConsolePipeline())
// Crawl with 5 threads
.thread(5)
// Start the spider
.run();
System.out.print(webMagicUtil.getLength());

} catch (Exception e) {
e.printStackTrace();
}

// NOTE(review): pasted fragment — @Component is a type-level annotation and is
// invalid on a method; the trailing extra '}' suggests this was cut from a
// larger class.
@Component
// Performs the simulated login and supplies the resulting cookie to the crawler.
public String getCookie (String username,String password) throws Exception {
RequestConfig requestConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.BEST_MATCH).build();// standard cookie policy /*.STANDARD_STRICT*/
CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();// apply the request config

// Fetch the login page first to obtain the hidden anti-CSRF token (__hash__).
HttpGet getHomePage = new HttpGet("http://www.digifilm.com.cn/index.php/public/login");
getHomePage.setHeader("Accept","text/html,application/xhtml+xml,image/jxr,*/*");
getHomePage.setHeader("Accept-Encoding","gzip,deflate");
getHomePage.setHeader("Accept-Language","zh-CN");
getHomePage.setHeader("Connection","Keep-Alive");
getHomePage.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393");
// Browser-like headers filled in above for the login-page request.
CloseableHttpResponse response = httpClient.execute(getHomePage);
String rec = setCookie(response);
//printResponse(response);
// HTML source of the login page
String responseHtml = EntityUtils.toString(response.getEntity());
// The page contains: <input name="__hash__" type="hidden" value=""/>
// NOTE(review): brittle screen-scrape — throws ArrayIndexOutOfBoundsException
// if the markup changes even slightly.
String hashValue = responseHtml.split("<input type=\"hidden\" name=\"__hash__\" value=\"")[1].split("\" />")[0];
response.close();
List<NameValuePair> valuePairs = new LinkedList<NameValuePair>();
valuePairs.add(new BasicNameValuePair("__hash__" , hashValue));
valuePairs.add(new BasicNameValuePair("password", password));
valuePairs.add(new BasicNameValuePair("username", username));
// Retry until the login probe below succeeds.
// NOTE(review): potentially an infinite loop, and 'valuePairs' gains an extra
// "verify" entry on every iteration instead of replacing the previous one.
while (true){
// Fetch the captcha image
HttpGet getCaptcha = new HttpGet("http://www.digifilm.com.cn/index.php/Verify/verify/?rand=" + Math.random());
CloseableHttpResponse imageResponse = httpClient.execute(getCaptcha);
// Convert the png response into jpg format.
InputStream in = imageResponse.getEntity().getContent();
BufferedImage bufferedImage = imageUtil.imageChange(in);
imageResponse.close();
in.close();
// Denoise the image
File file = imageUtil.cleanImage(bufferedImage);
// OCR the denoised image
String text = scanCodeUtil.recognizeText(file);
System.out.println("扫描后的图片:"+text);
valuePairs.add(new BasicNameValuePair("verify", text));
// Build the login POST
UrlEncodedFormEntity entity = new UrlEncodedFormEntity(valuePairs, Consts.UTF_8);
HttpPost post = new HttpPost("http://www.digifilm.com.cn/index.php/public/checklogin");
post.setEntity(entity);
CloseableHttpResponse httpResponse = httpClient.execute(post);// log in and get the response
httpResponse.close();
// Probe GET to test whether the login cookie was actually obtained
HttpGet g = new HttpGet("http://www.digifilm.com.cn/index.php/member/index");// page shown after login
// The client injects its stored cookies into this request; without a valid
// login cookie the probe is expected to come back as the login page instead.
// Closeable response object.
CloseableHttpResponse r = httpClient.execute(g);

// Heuristic success check: assumes a Content-Length above 7000 means the
// logged-in member page was served — TODO confirm this threshold.
Header headers= r.getFirstHeader("Content-Length");
Integer contentLength = Integer.parseInt(headers.getValue());
if(contentLength > 7000){
r.close();
break;
}
r.close();
}
//httpClient.close();
// NOTE(review): brittle — assumes 'rec' has at least three ';'-separated parts
// and that index 2 holds the piece the caller wants.
String rec2 = rec.split(";")[2];
return rec2;
}
}

@Component
public class WebMagicUtil implements PageProcessor {

private int length;
//部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数、超时时间等
private Site site ;
/**
 * Configures the crawler {@code Site} with the session id obtained from the
 * simulated HttpClient login.
 *
 * @param cookie value of the PHPSESSID cookie (value only, not "name=value")
 */
public void setSite(String cookie) {
    this.site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(1000)
            .setTimeOut(1000 * 60 * 60)
            .setCycleRetryTimes(3)
            // The domain must be set before adding cookies, otherwise the
            // cookie information does not take effect.
            .setDomain("www.digifilm.com.cn")
            // Session cookie captured from the simulated login.
            .addCookie("PHPSESSID", cookie)
            // Browser-like headers so the site treats requests as coming from
            // a browser rather than a crawler.
            .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393")
            .addHeader("Accept", "text/html, application/xhtml+xml, image/jxr, */*")
            // FIX: header name was misspelled "Accept-Encodin" (missing 'g'),
            // so the server never saw a valid Accept-Encoding header.
            .addHeader("Accept-Encoding", "gzip,deflate")
            .addHeader("Accept-Language", "zh-CN")
            .addHeader("Connection", "Keep-Alive");
    //.addHeader("Referer","http://www.digifilm.com.cn/index.php/public/login");
}

@Override
// process is the core extraction hook of the crawler; pages are routed by
// matching their URL against the regexes below.
public void process(Page page) {
// On the start page: collect links to the key-list index pages.
// NOTE(review): the xpath attribute value is unquoted ([@class=leaguer]) —
// verify the xpath engine accepts this; the other xpaths quote the value.
if(page.getUrl().regex("(.*/index\\.php/member/index)").match()) {
page.addTargetRequests(page.getHtml().xpath("//div[@class=leaguer]").links().regex("(.*/index\\.php/(\\w+)_down/index)").all());
}
// On a key-list page: collect pagination links and links to individual
// download pages.
if (page.getUrl().regex("(.*/index\\.php/(\\w+)_down/index)").match()) {
page.addTargetRequests(page.getHtml().xpath("//div[@class='SMAMiddle SMAMiddlelb']").links().regex("(.*/index\\.php/(\\w+)_down/content/id/.*)").all());
// Pagination links (unquoted attribute here too — see note above)
page.addTargetRequests(page.getHtml().xpath("//div[@class=fanye_1]").links().regex("(.*/index\\.php/(\\w+)_down/index\\?&p=\\d+)").all());
}
// On an individual download page: extract fields and download every key link.
if (page.getUrl().regex("(.*/index\\.php/(\\w+)_down/content/id/\\w+)").match()){
page.putField("filmTitle", page.getHtml().xpath("//div[@class='videoDescri']/span[1]/text()"));
page.putField("filmSchedule", page.getHtml().xpath("//div[@class='videoDescri']/span[2]/text()"));
page.putField("filmType", page.getHtml().xpath("//div[@class='videoDescri']/span[3]/text()"));
page.putField("secretKey", page.getHtml().xpath("//div[@class='SMAMiddle SMAMiddlela']/ul/li/a[@class='load']").links().regex("(.*/download\\.php\\?mid=.*)").all());
List<String> list = page.getResultItems().get("secretKey");
for (String url: list) {
try {
System.out.println(url);
// NOTE(review): downloads via a plain URL connection, which does not carry
// the Site cookies — likely why the downloads return "请重新登陆!".
downloadFromUrl(url,"C:\\360Downloads\\Test\\");
} catch (Exception e) {
e.printStackTrace();
}
}
// Count of download pages processed (exposed via getLength()).
length++;
}
}

//测试下载代码
/**
 * Downloads the resource at {@code url} into directory {@code dir}.
 *
 * <p>The target file name is generated by {@link #getFileNameFromUrl(String)}
 * (timestamp-based; the URL itself is not parsed for a name). Note that this
 * opens a fresh URL connection, which carries no session cookies.
 *
 * @param url source URL to download
 * @param dir destination directory (created if absent)
 * @return {@code "Successful!"} on success, {@code "Fault"} on any failure
 */
public static String downloadFromUrl(String url, String dir) {
    try {
        URL httpurl = new URL(url);
        String fileName = getFileNameFromUrl(url);
        System.out.println(fileName);
        File saveDir = new File(dir);
        if (!saveDir.exists()) {
            // FIX: mkdirs() also creates missing parent directories; mkdir()
            // silently fails when any parent is absent.
            saveDir.mkdirs();
        }
        // FIX: dropped the redundant createNewFile() call — copyURLToFile
        // creates the destination file itself.
        File file = new File(saveDir, fileName);
        FileUtils.copyURLToFile(httpurl, file);
    } catch (Exception e) {
        // Best-effort: log the failure and signal it via the return value.
        e.printStackTrace();
        return "Fault";
    }
    return "Successful!";
}

/**
 * Generates a file name for a downloaded key file.
 *
 * <p>The current implementation ignores {@code url} and returns a unique,
 * timestamp-based name with an {@code .xml} extension; the parameter is kept
 * for interface compatibility with existing callers.
 *
 * @param url source URL (currently unused)
 * @return a name of the form {@code <epochMillis>.xml}
 */
public static String getFileNameFromUrl(String url) {
    // FIX: replaced the deprecated boxing constructor new Long(...) with
    // Long.toString — same result, no needless allocation.
    return Long.toString(System.currentTimeMillis()) + ".xml";
}

```
Jobin0528
2016-11-02 18:32:10 +08:00
@livid 可以帮忙删除这段代码么?
zoran
2016-11-02 18:36:14 +08:00
java 爬虫可以试试 https://github.com/zhuoran/crawler4j 记得点赞~~ :)
Yc1992
2016-11-02 18:37:04 +08:00
是不是网站设置了单点登录

这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。

https://www.v2ex.com/t/317296

V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。

V2EX is a community of developers, designers and creative people.

© 2021 V2EX