爬(java):记一段爬取acfun视频及处理过程
来自:Pixiv 画师( Dakhuf )
- 开始准备
- 1.0简单模式
- 2.0反水:从头开始
- 小结
开始准备
导入爬虫模块的依赖:
<!-- WebMagic 0.7.3: the crawler framework used in part 1.0 (Spider / PageProcessor / Pipeline) -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<!-- webmagic-extension: WebMagic companion module, kept at the same version as core -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<!-- Jsoup 1.10.3: HTML fetching/parsing, used for the 2.0 fallback approach below -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
目标对象:
导航栏右侧,正文主体3块,排行榜3块的(标题,图片,视频链接)
1.0简单模式
①分析页面,结论:简单的静态页面布局。直接上手
②根据网站的页面布局获取目标标签和属性,这里使用xpath查找xml文档。
public class Pa {
public static class AcfunProcessor implements PageProcessor {
public void process(Page page) {
// 右侧*6
page.putField("rside_url_list", page.getHtml().xpath(
"//div[@class=\"fr slider-right-x6\"]/ul/li/a/@href").all());
page.putField("rside_img_list", page.getHtml().xpath(
"//div[@class=\"fr slider-right-x6\"]/ul/li/a/img/@src").all());
page.putField("rside_title_list", page.getHtml().xpath(
"//div[@class=\"fr slider-right-x6\"]/ul/li/a/img/@alt").all());
// 娱乐*8 ::11
page.putField("yl_url_list",page.getHtml().xpath(
"//div[@m-id=\"15\"]/figure/a/@href").all());
page.putField("yl_img_list",page.getHtml().xpath(
"//div[@m-id=\"15\"]/figure/a/img/@data-original").all());
page.putField("yl_title_list",page.getHtml().xpath(
"//div[@m-id=\"15\"]/figure/a/img/@alt").all());
// 游戏*8 ::11
page.putField("yx_url_list",page.getHtml().xpath(
"//div[@m-id=\"61\"]/figure/a/@href").all());
page.putField("yx_img_list",page.getHtml().xpath(
"//div[@m-id=\"61\"]/figure/a/img/@data-original").all());
page.putField("yx_title_list",page.getHtml().xpath(
"//div[@m-id=\"61\"]/figure/a/img/@alt").all());
// 动画*8 ::11
page.putField("dh_url_list",page.getHtml().xpath(
"//div[@m-id=\"29\"]/figure/a/@href").all());
page.putField("dh_img_list",page.getHtml().xpath(
"//div[@m-id=\"29\"]/figure/a/img/@data-original").all());
page.putField("dh_title_list",page.getHtml().xpath(
"//div[@m-id=\"29\"]/figure/a/img/@alt").all());
// rank1/2/3*8 ::10
page.putField("rank1_url_list",page.getHtml().xpath(
"//div[@m-id=\"17\"]//ul[@data-con=\"1\"]/li/a/@href").all());
page.putField("rank1_title_list",page.getHtml().xpath(
"//div[@m-id=\"17\"]//ul[@data-con=\"1\"]/li/a/@title").all());
page.putField("rank2_url_list",page.getHtml().xpath(
"//div[@m-id=\"63\"]//ul[@data-con=\"1\"]/li/a/@href").all());
page.putField("rank2_title_list",page.getHtml().xpath(
"//div[@m-id=\"63\"]//ul[@data-con=\"1\"]/li/a/@title").all());
page.putField("rank3_url_list",page.getHtml().xpath(
"//div[@m-id=\"31\"]//ul[@data-con=\"1\"]/li/a/@href").all());
page.putField("rank3_title_list",page.getHtml().xpath(
"//div[@m-id=\"31\"]//ul[@data-con=\"1\"]/li/a/@title").all());
}
private Site site = Site.me();
public Site getSite() {
return site;
}
}
}
③创建pa任务并分发多个持久化
/**
 * Builds and runs the AcFun home-page crawl: one seed URL, one processor,
 * and a dedicated persistence pipeline per page section. run() is
 * synchronous — it blocks until the crawl finishes.
 */
public void paVideo() {
    Spider spider = Spider.create(new Pa.AcfunProcessor());
    spider.addUrl("https://www.acfun.cn/");
    // One pipeline per section; each one reads its own keys from ResultItems.
    spider.addPipeline(new rsidepipeline());
    spider.addPipeline(new ylpipeline());
    spider.addPipeline(new yxpipeline());
    spider.addPipeline(new dhpipeline());
    spider.addPipeline(new rank1pipeline());
    spider.addPipeline(new rank2pipeline());
    spider.addPipeline(new rank3pipeline());
    spider.run();
}
④持久化
#连接数据库
/**
 * Minimal JDBC bootstrap: opens a single shared MySQL connection once, at
 * class-load time, and hands the same instance to every caller.
 *
 * NOTE(review): one static Connection is not thread-safe and is never closed;
 * acceptable for a one-shot crawler run, but use a connection pool for
 * anything concurrent or long-lived.
 */
public class Jdbc {
    private static final String URL = "jdbc:mysql://localhost:3306/bb";
    private static final String USER = "root";
    private static final String PASSWORD = "root";
    private static Connection conn = null;

    static {
        try {
            // 1. Load the MySQL driver (Connector/J 5.x class name; 8+ uses com.mysql.cj.jdbc.Driver).
            Class.forName("com.mysql.jdbc.Driver");
            // 2. Open the shared database connection.
            conn = DriverManager.getConnection(URL, USER, PASSWORD);
        } catch (ClassNotFoundException | SQLException e) {
            // Fail fast instead of swallowing the error and leaving conn == null:
            // every later query would otherwise die with an opaque NullPointerException.
            throw new ExceptionInInitializerError(e);
        }
    }

    /** @return the shared connection opened at class-initialization time */
    public static Connection getConnection() {
        return conn;
    }
}
(以其中一个任务为例)
public class ylpipeline implements Pipeline {
@Override
public void process(ResultItems resultItems, Task task) {
List<String> yl_url_list = resultItems.get("yl_url_list");
List<String> yl_img_list = resultItems.get("yl_img_list");
List<String> yl_title_list = resultItems.get("yl_title_list");
Connection conn = Jdbc.getConnection();
String sql = "INSERT INTO yl_video (title,url,img) VALUES (?,?,?)";
for (int i = 0; i <yl_title_list.size() ; i++) {
try {
PreparedStatement ptmt = conn.prepareStatement(sql);
ptmt.setString(1,yl_title_list.get(i));
ptmt.setString(2,"https://www.acfun.cn"+yl_url_list.get(i));
ptmt.setString(3,yl_img_list.get(i));
ptmt.execute();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
pa结果截图:
至此,爬虫任务已经完成一半,下面就是进行视频解析与相应的数据处理了。但是,第二天起来发现网站的页面代码改了:按原先的 xpath 路径去获取,结果全是 null。显然是爬得多了,网站有了防范。
2.0反水:从头开始
①分析页面:
F12调试,发现页面中的div标签内并非为空,存在实际的内容。考虑js或者请求加载进来的(WebMagic获取的html页面是第一次加载的网站页面,其中不会包含js,请求加载进来的东西),通过查找页面发现几个可疑的js。
排行榜1展示的内容
推荐视频展示的内容
娱乐版块展示的内容
分别对应了页面的几块内容区。
②尝试爬取js内容:
public class PaByDocument {
//存储列表
List<String> rside_title_list = new ArrayList<>();
List<String> rside_img_list = new ArrayList<>();
List<String> rside_url_list = new ArrayList<>();
List<String> yl_title_list = new ArrayList<>();
List<String> yl_img_list = new ArrayList<>();
List<String> yl_url_list = new ArrayList<>();
List<String> rank1_title_list = new ArrayList<>();
List<String> rank1_url_list = new ArrayList<>();
//获取到rside,yl,rank1数据列表
public void JsoupParse() throws IOException {
//匹配规则
String regex_title = "alt=\\\\\\\"(\\S+)\\\\\\\"";
String regex_img = "src=\\\\\\\"(\\S+)\\\\\\\"";
String regex_url = "/v/ac[0-9]+";
List<String> regex_list = new ArrayList<>();
regex_list.add(regex_title);
regex_list.add(regex_img);
regex_list.add(regex_url);
//获取html document
Document document = Jsoup.parse(new URL("https://www.acfun.cn"), 6000);
String doc = document.toString();
//------------------------------------------------- rside切分*6---------------------------------------------------------------
String doc_k1 = doc.substring(doc.indexOf("slider-right-images"), doc.indexOf("window.sliderData"));
String[] rside = doc_k1.split("recommend-video log-item");
//匹配rside特征,保存
for (int i = 1; i < rside.length; i++) {
for (int j = 0; j < regex_list.size(); j++) {
Pattern p = Pattern.compile(regex_list.get(j));
Matcher m = p.matcher(rside[i]);
if (m.find()) {
if (j == 0) {
rside_title_list.add(m.group(1));
} else if (j == 1) {
rside_img_list.add(m.group(1));
} else if (j == 2) {
rside_url_list.add(m.group());
}
} else {
// System.out.println("特征匹配失败:第"+(i+1)+"分块--第"+(j+1)+"匹配");
if (j == 0)
rside_title_list.add("特征匹配失败");
}
}
}
//rside匹配失败后处理
for (int i = 0; i < rside_title_list.size(); i++) {
if (rside_title_list.get(i) == "特征匹配失败") {
rside_title_list.set(i, rside_url_list.get(i).split("/v/")[1]);
}
}
//-----------------------------------------------yl切分*12------------------------------------------------------
String[] doc_k2 = doc.split("big-image");
String[] yl = doc_k2[1].split("normal-video-container");
// 匹配yl特征,保存
for (int i = 0; i < yl.length; i++) {
for (int j = 0; j < regex_list.size(); j++) {
Pattern p = Pattern.compile(regex_list.get(j));
Matcher m = p.matcher(yl[i]);
if (m.find()) {
if (j == 0) {
yl_title_list.add(m.group(1));
} else if (j == 1) {
yl_img_list.add(m.group(1));
} else if (j == 2) {
yl_url_list.add(m.group());
}
} else {
if (j == 0)
yl_title_list.add("特征匹配失败");
}
}
}
//yl匹配失败后处理
for (int i = 0; i < yl_title_list.size(); i++) {
if (yl_title_list.get(i) == "特征匹配失败") {
yl_title_list.set(i, yl_url_list.get(i).split("/v/")[1]);
}
}
//-----------------------------------------------rank1切分*10------------------------------------------------------
String doc_k3 = doc.substring(doc.indexOf("list-content-videos active"),doc.indexOf("class=\\\"list-content-videos\\\""));
String[] rank1 = doc_k3.split("video-title text-overflow");
regex_list.remove(1);
regex_list.set(0,"title=\\\\\\\"(\\S+)
");
for (int i = 1; i < rank1.length; i++) {
for (int j = 0; j < regex_list.size(); j++) {
Pattern p = Pattern.compile(regex_list.get(j));
Matcher m = p.matcher(rank1[i]);
if (m.find()) {
if (j == 0) {
rank1_title_list.add(m.group(1).split("
")[0]);
} else if (j == 1) {
rank1_url_list.add(m.group());
}
} else {
if (j == 0)
rank1_title_list.add("特征匹配失败");
}
}
}
for (int i = 0; i < rank1_title_list.size(); i++) {
if (rank1_title_list.get(i) == "特征匹配失败") {
rank1_title_list.set(i, rank1_url_list.get(i).split("/v/")[1]);
}
}
}
}
通过对字符串的一番切分,再用正则取出目标内容,可以正确地打印出相应的结果。当然,由于一些特殊符号不太好用正则处理,title 字段匹配失败时会退化为取 url 中的 acxxxxxx 作为标题。
③再次分析:
以上3个分区的内容拿到了,其他分区的内容如何获取?再次F12调试:在xhr请求中发现可疑请求,名为:
不难看出其中包含了所有分区关键词。拿到它的请求链接放入浏览器地址栏,直接下载出了一个文件,用文本编辑器打开:
json格式,内容非常的长。通过搜索几个关键词确定,里面包含了页面几乎所有内容。
④建立http链接,直接拿xhr文件
//获取xhr文本
public String getXhr() throws IOException {
String url="https://www.acfun.cn/?pagelets=pagelet_game,pagelet_douga,pagelet_bangumi_list,pagelet_life,pagelet_tech,pagelet_dance,pagelet_music,pagelet_film,pagelet_fishpond,pagelet_sport&reqID=0&ajaxpipe=1&t=1576228936457";
Connection con = Jsoup.connect(url);
//请求头设置,特别是cookie设置
con.header("Accept", "text/html, application/xhtml+xml, */*");
con.header("Content-Type", "application/x-www-form-urlencoded");
con.header("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0))");
con.ignoreContentType(true);
// con.header("Cookie", cookie);
Document doc=con.get();
String xhr=doc.toString().replaceAll("\\"","");
return xhr;
}
处理字符串,拿到数据。
// 获取到rank2-3,yx,dh的数据列表 — parses the ajaxpipe XHR text fetched by getXhr()
// and fills the dh (animation), yx (game), rank2 (life ranking) and rank3 lists.
// regex_list and the *_list collections are fields of the enclosing class
// (declared outside this excerpt).
// NOTE(review): the escape-heavy regexes below are reverse-engineered from one
// snapshot of the payload and depend on exactly how many backslashes survive
// getXhr()'s post-processing — verify against a live response before changing.
public void XhrParse(String xhr){
// Index 0: title from alt=..., capture stops at the first CJK punctuation mark.
// (The '|' separators inside the character class are literal and redundant there.)
regex_list.add(0,"alt=\\\"\\\\(\\S+)[\\u3002|\\uff1f|\\uff01|\\uff0c|\\u3001|\\uff1b|\\uff1a|\\u201c|\\u201d|\\u2018|\\u2019|\\uff08|\\uff09|\\u300a|\\u300b|\\u3008|\\u3009|\\u3010|\\u3011|\\u300e|\\u300f|\\u300c|\\u300d|\\ufe43|\\ufe44|\\u3014|\\u3015|\\u2026|\\u2014|\\uff5e|\\ufe4f|\\uffe5]" );
// Index 1: cover image url from src=...
regex_list.add(1,"src=\\\"\\\\(\\S+)\\\\\\\"");
// Index 2: video link of the form /v/ac<digits>.
regex_list.add(2,"/v/ac[0-9]+");
//------------------------------------------- dh (animation): between "video-list-6" and the first ranking block -----------------------------------------------------
String xhr_dh = xhr.substring(xhr.indexOf("video-list-6"));
xhr_dh = xhr_dh.substring(0, xhr_dh.indexOf("ranked-list"));//dh
// Split into one chunk per video card; chunk 0 is the pre-card prefix and is skipped.
String[] dh = xhr_dh.split("cover");
for (int i = 1; i < dh.length; i++) {
for (int j = 0; j < regex_list.size(); j++) {
Pattern p = Pattern.compile(regex_list.get(j));
Matcher m = p.matcher(dh[i]);
if (m.find()) {
if (j == 0) {
// Strip remaining punctuation/brackets from the title.
dh_title_list.add(m.group(1).replaceAll( "[\\p{P}+~$`^=|<>\\[\\]<>《》]" , ""));
} else if (j == 1) {
dh_img_list.add(m.group(1));
}else if (j==2){
dh_url_list.add(m.group());
}
} else {
// Only the title gets a sentinel on failure; a missed img/url silently
// shortens its list, so the lists can drift out of index alignment.
if (j == 0)
dh_title_list.add("特征匹配失败");
}
}
}
// Fallback: replace sentinel titles with the acXXXX id from the url at the same index.
// NOTE(review): "==" happens to work here because both sides are interned literals
// (real titles come from Matcher.group and compare false); equals() would be safer.
for (int i = 0; i < dh_title_list.size(); i++) {
if (dh_title_list.get(i) == "特征匹配失败") {
dh_title_list.set(i, dh_url_list.get(i).split("/v/")[1]);
}
}
//------------------------------------------------- yx (games): same procedure, anchored at "video-list-19" -------------------------------------------------------
String xhr_yx = xhr.substring(xhr.indexOf("video-list-19"));
xhr_yx = xhr_yx.substring(0, xhr_yx.indexOf("ranked-list"));//yx
String[] yx = xhr_yx.split("cover");
for (int i = 1; i < yx.length; i++) {
for (int j = 0; j < regex_list.size(); j++) {
Pattern p = Pattern.compile(regex_list.get(j));
Matcher m = p.matcher(yx[i]);
if (m.find()) {
if (j == 0) {
yx_title_list.add(m.group(1).replaceAll( "[\\p{P}+~$`^=|<>\\[\\]<>《》]" , ""));
} else if (j == 1) {
yx_img_list.add(m.group(1));
}else if (j==2){
yx_url_list.add(m.group());
}
} else {
if (j == 0)
yx_title_list.add("特征匹配失败");
}
}
}
// Same sentinel-title repair as for dh above.
for (int i = 0; i < yx_title_list.size(); i++) {
if (yx_title_list.get(i) == "特征匹配失败") {
yx_title_list.set(i, yx_url_list.get(i).split("/v/")[1]);
}
}
//------------------------------------------------- rank2 生活 (life ranking) ------------------------------------------------------
// Ranking entries carry no cover image: drop the img pattern and replace the
// url pattern with one that captures the numeric id from data-atomid.
regex_list.remove(1);
regex_list.set(1,"data-atomid=\\\"\\\\([0-9]+)\\\\\\\"");
String xhr_rank2 = xhr.substring(xhr.indexOf("class=\"\\ranked-list\\\""),xhr.indexOf("class=\"\\more\\\""));
String[] rank2 = xhr_rank2.split("block-right");
for (int i = 0; i < rank2.length; i++) {
for (int j = 0; j < regex_list.size(); j++) {
Pattern p = Pattern.compile(regex_list.get(j));
Matcher m = p.matcher(rank2[i]);
if (m.find()) {
if (j == 0) {
rank2_title_list.add(m.group(1).replaceAll("[\\p{P}+~$`^=|<>\\[\\]<>《》]",""));
} else if (j == 1) {
// Rebuild the /v/acXXXX url from the captured atom id; the split("\\\\")
// trims anything after a trailing backslash.
rank2_url_list.add("/v/ac"+m.group(1).split("\\\\")[0]);
}
} else {
if (j == 0)
rank2_title_list.add("特征匹配失败");
}
}
}
for (int i = 0; i < rank2_title_list.size(); i++) {
if (rank2_title_list.get(i) == "特征匹配失败") {
rank2_title_list.set(i, rank2_url_list.get(i).split("/v/")[1]);
}
}
//------------------------------------------------ rank3: everything after the "more" marker ---------------------------------------------------------
String xhr_rank3 = xhr.split("class=\\\"\\\\more\\\\\\\"")[1];
String[] rank3 = xhr_rank3.split("block-right");
// Trim the leading chunk down to the actual ranking content.
rank3[0] = rank3[0].substring(rank3[0].indexOf("ranked-list-content"));
for (int i = 0; i < rank3.length; i++) {
for (int j = 0; j < regex_list.size(); j++) {
Pattern p = Pattern.compile(regex_list.get(j));
Matcher m = p.matcher(rank3[i]);
if (m.find()) {
if (j == 0) {
rank3_title_list.add(m.group(1).replaceAll("[\\p{P}+~$`^=|<>\\[\\]<>《》]",""));
} else if (j == 1) {
rank3_url_list.add("/v/ac"+m.group(1).split("\\\\")[0]);
}
} else {
if (j == 0)
rank3_title_list.add("特征匹配失败");
}
}
}
for (int i = 0; i < rank3_title_list.size(); i++) {
if (rank3_title_list.get(i) == "特征匹配失败") {
rank3_title_list.set(i, rank3_url_list.get(i).split("/v/")[1]);
}
}
}
}
⑤发现数据有效,持久化(略)