封装爬虫模块index.js
const puppeteer = require("puppeteer");
/**
 * Pause helper: returns a promise that resolves after a delay.
 *
 * Used by the page runner to give the loaded page some extra time before
 * the in-page callback is evaluated.
 *
 * @param {number} [ms=2000] - Delay in milliseconds; defaults to the
 *   2-second pause this module has always used.
 * @returns {Promise<void>} Resolves (with no value) once the delay elapses.
 */
function timeout(ms = 2000) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
module.exports = async (url,callback) => {
//打开浏览器chrome 路由
const browser = await puppeteer.launch({
executablePath: "/Applications/Google\ Chrome.app/"+
"Contents/MacOS/Google\ Chrome",
headless: "new" // 添加这一行
});
// 创建自页面
const page = await browser.newPage();
// 打开一个网页
await page.goto(url, {
waitUtil: "networkidle2"
});
await timeout();
// 在页面中执行js
let results = await page.evaluate(callback);
// 关闭Chrome
await browser.close();
// 返货js执行结构
return results;
}
爬取数据
const puppeteer = require("./index")
const { SingleBar, Presets } = require('cli-progress');
const fs = require('fs');
/**
 * Pause helper: returns a promise that resolves after a delay.
 *
 * Intended for spacing out successive page requests in the crawler.
 *
 * @param {number} [ms=3000] - Delay in milliseconds; defaults to the
 *   3-second pause used between requests.
 * @returns {Promise<void>} Resolves (with no value) once the delay elapses.
 */
function timeout(ms = 3000) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
// Listing page that is crawled for movie links.
const url = 'https://www.ygdy8.net/html/gndy/china/index.html';

/**
 * Runs inside the listing page: collects the text and target of every
 * `a.ulink` anchor. It is passed to the page runner, so it must stay
 * self-contained and only use browser globals.
 *
 * @returns {{name: string, url: string}[]} One entry per matching anchor.
 */
const collectListLinks = () => Array.from(
  document.querySelectorAll("a[class='ulink']"),
  (anchor) => ({ name: anchor.text, url: anchor.href })
);

/**
 * Runs inside a movie detail page: reads the magnet link and the poster
 * image address. Missing elements yield empty strings instead of throwing.
 * Like `collectListLinks`, it must stay self-contained.
 *
 * @returns {{bt: string, img: string}} Magnet link and image address.
 */
const collectMovieDetails = () => {
  const magnet = document.querySelector("a[href^='magnet:']");
  const poster = document.querySelector('#Zoom img');
  return {
    bt: (magnet && magnet.getAttribute('href')) || '',
    img: (poster && poster.getAttribute('src')) || ''
  };
};

/**
 * Returns the first substring of `text` matched by `pattern`, with the
 * characters matched by `delimiters` removed, or '' when nothing matches.
 *
 * @param {string} text - Link text to search.
 * @param {RegExp} pattern - Non-global pattern locating the delimited part.
 * @param {RegExp} delimiters - Global pattern for the delimiters to strip.
 * @returns {string} The delimited content, or '' if `pattern` did not match.
 */
function extractDelimited(text, pattern, delimiters) {
  const found = (text || '').match(pattern);
  return found ? found[0].replace(delimiters, '') : '';
}

/**
 * Turns the flat link list from the listing page into movie records.
 * Links are treated as alternating pairs: a category link ("[...]")
 * followed by a movie link ("《...》"). A trailing unpaired link is ignored.
 *
 * @param {{name: string, url: string}[]} links - Links in page order.
 * @returns {{categorize: string, categorizeUrl: string,
 *   movieName: string, movieUrl: string}[]} One record per complete pair;
 *   an empty array when there are fewer than two links.
 */
function pairCategoryAndMovie(links) {
  const movies = [];
  // Step two at a time and stop before an incomplete final pair.
  for (let i = 0; i + 1 < links.length; i += 2) {
    const category = links[i];
    const movie = links[i + 1];
    movies.push({
      categorize: extractDelimited(category.name, /\[(.*?)\]/, /\[|\]/g),
      categorizeUrl: category.url || '',
      movieName: extractDelimited(movie.name, /《(.*?)》/, /《|》/g),
      movieUrl: movie.url || ''
    });
  }
  return movies;
}

/**
 * Visits every movie page in turn and adds `bt` and `img` to each record,
 * showing a progress bar while doing so.
 *
 * @param {Object[]} movies - Records from `pairCategoryAndMovie`; mutated
 *   in place.
 * @returns {Promise<Object[]>} The same array once every page was visited.
 */
async function attachDetails(movies) {
  const progressBar = new SingleBar({}, Presets.shades_classic);
  progressBar.start(movies.length, 0);
  try {
    for (let i = 0; i < movies.length; i++) {
      // Pause between requests so the site is not hit in rapid succession.
      await timeout();
      const details = await puppeteer(movies[i].movieUrl, collectMovieDetails);
      movies[i].bt = details.bt;
      movies[i].img = details.img;
      progressBar.update(i + 1);
    }
  } finally {
    // Restore the terminal even if a page fails to load.
    progressBar.stop();
  }
  return movies;
}

/**
 * Crawls the listing page, enriches every movie with its detail-page data
 * and writes the result to ./data.json.
 *
 * @returns {Promise<void>}
 */
async function crawl() {
  // Fetch the listing page and extract category/movie pairs.
  const links = await puppeteer(url, collectListLinks);
  const movies = pairCategoryAndMovie(links);
  if (movies.length === 0) {
    console.log('没有获取到列表');
    process.exit();
  }
  // Open each movie page to fetch its magnet link and poster.
  await attachDetails(movies);
  await fs.promises.writeFile('./data.json', JSON.stringify(movies, null, 4));
  console.log('Data written to file');
}

// Start the crawl; report any failure and signal it through the exit code.
const result = crawl().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});