亲宝软件园·资讯

展开

关于动态页面静态化的技术探索

文艺青年.茶 人气:0

一、准备工作

1、使用tornado部署后端服务

架构图:

1、config.py 配置端口

 

2、application配置路由

 

3、添加Handler处理响应请求

 

4、启动服务

 

打开server.py,右击选择Run ‘server’。

2、使用nuxt开发前端页面

1、使用脚手架create-nuxt-app创建项目

npx create-nuxt-app test

cd test

npm run dev

2、目录结构

 

3、修改nuxt.config.js扩展路由

  // Router section of the Nuxt config: adds an extra /index.html route that
  // renders the same component as the default index page.
  router: {
    // Receives the generated route list and a path resolver; the new route is
    // appended to the list in place.
    extendRoutes(routes, resolve){
      routes.push({
        name: 'index.html',
        path: '/index.html',
        component: resolve(__dirname, 'pages/index.vue'),
      });
    }
  }

4、修改pages/index.vue文件

 

使用asyncData方法在页面组件加载之前调用接口获取数据,模板进行展示,如下:

 

5、编译部署

cd test

npm run build

npm run start

6、页面展示

 

至此准备工作结束,接下来我们爬取静态数据。

3、使用puppeteer爬取网页

1、获取HTML内容

/**
 * Loads `url` in the given Puppeteer page and returns the rendered HTML.
 *
 * @param {string} url - Address of the page to load.
 * @param {object} page - Page object exposing `goto` and `content`.
 * @returns {Promise<string>} Serialized HTML of the page after navigation.
 */
async function fetch(url, page){
  // Possible `waitUntil` values:
  //   load             - the page's load event has fired
  //   domcontentloaded - the page's DOMContentLoaded event has fired
  //   networkidle0     - no network connections for at least 500 ms
  //   networkidle2     - no more than 2 network connections for at least 500 ms
  const navigation = { waitUntil: 'networkidle0' };
  await page.goto(url, navigation);
  return page.content();
}

2、提取css、js、img内容

 

解析HTML属于CPU计算,不必异步处理

 

/**
 * Scans a page's HTML for <link>, <script> and <img> references that point at
 * the page's own site and queues every one that has not been handled yet.
 *
 * Results are appended to the module-level `waitting_stylesheets`,
 * `waitting_scripts` and `waitting_images` arrays as `{ filename, urlpath }`
 * objects; nothing is returned.
 *
 * @param {string} url - Absolute URL of the page the HTML was loaded from.
 * @param {string} html - HTML source of that page.
 * @throws {TypeError} If `url` is not a valid absolute URL.
 */
function extract_urls(url, html){
  const page_url = new URL(url);
  const $ = cheerio.load(html);

  // References that carry their own scheme (http:, https:, data:, ...) or a
  // protocol-relative "//host" prefix are left untouched, as absolute URLs
  // were before. Requiring the ":" avoids skipping relative files whose name
  // merely starts with "http".
  const external = /^(?:[a-z][a-z0-9+.-]*:|\/\/)/i;

  // Queues every local reference found in `attribute` of `elements`, unless it
  // is already recorded in `seen` or `waiting`.
  const collect = (elements, attribute, seen, waiting, accept) => {
    for (const element of elements) {
      const src = element.attribs[attribute] || '';
      if (!src || external.test(src)) continue;
      if (accept && !accept(src)) continue;

      // Let the URL parser resolve "a.css", "./a.css", "../a.css" and "/a.css"
      // against the page URL; string concatenation produced "host//a.css".
      const urlpath = new URL(src, page_url).href;
      const filename = url2path(urlpath);

      const same_file = (entry) => entry.filename === filename;
      if (seen.some(same_file) || waiting.some(same_file)) continue;

      waiting.push({ filename, urlpath });
    }
  };

  collect($('link'), 'href', seen_stylesheets, waitting_stylesheets);
  collect($('script'), 'src', seen_scripts, waitting_scripts);
  // Only .jpg / .png images are mirrored.
  collect($('img'), 'src', seen_images, waitting_images, (src) => /\.(jpg|png)$/.test(src));
}

 

3、保存下载

使用Promise封装异步写入;写入前先判断目录是否存在,不存在则递归创建。

 

/**
 * Ensures that the directory `filepath` exists, creating any missing parent
 * directories along the way.
 *
 * @param {string} filepath - Directory path to create.
 * @returns {boolean} Always `true` once the path exists.
 * @throws {Error} Any error raised by `fs.mkdirSync` (e.g. permission denied).
 */
function mkdirs(filepath) {
  // Fast path kept from the original: anything already present counts as done.
  if (fs.existsSync(filepath)) {
    return true;
  }
  // The built-in recursive mode creates every missing ancestor in one call.
  // The hand-written recursion it replaces never terminated when no ancestor
  // of the path existed (path.dirname of a root is the root itself).
  fs.mkdirSync(filepath, { recursive: true });
  return true;
}

/**
 * Writes `data` to `filename`, creating the parent directory first.
 *
 * @param {string} filename - Destination file path.
 * @param {string|Buffer|ArrayBuffer} data - Content handed to `fs.writeFile`.
 * @returns {Promise<string>} Resolves with `filename` once the write finished;
 *   rejects with the error from directory creation or from `fs.writeFile`.
 */
function write(filename, data) {
  return new Promise((resolve, reject) => {
    // A throw here (directory cannot be created) rejects the promise.
    mkdirs(path.dirname(filename));
    fs.writeFile(filename, data, (err) => {
      if (err) {
        reject(err);
        // Stop here so the failure path never falls through to resolve().
        return;
      }
      resolve(filename);
    });
  });
}

 

4、完整代码

'use strict';
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const axios = require('axios');
const fs = require('fs');
const path = require('path');

// Items recorded after they were written to disk. The resource lists hold
// { filename, urlpath } objects; seen_htmls holds page URLs.
let seen_htmls = [];
let seen_scripts = [];
let seen_stylesheets = [];
let seen_images = [];

// Work queues. waitting_htmls is seeded with the start page; the other three
// are filled by extract_urls() and emptied after each page is processed.
let waitting_htmls = ["http://localhost:3000/index.html"];
let waitting_scripts = [];
let waitting_stylesheets = [];
let waitting_images = [];

/**
 * Loads `url` in the given Puppeteer page and returns the rendered HTML.
 *
 * @param {string} url - Address of the page to load.
 * @param {object} page - Page object exposing `goto` and `content`.
 * @returns {Promise<string>} Serialized HTML of the page after navigation.
 */
async function fetch(url, page){
  // Possible `waitUntil` values:
  //   load             - the page's load event has fired
  //   domcontentloaded - the page's DOMContentLoaded event has fired
  //   networkidle0     - no network connections for at least 500 ms
  //   networkidle2     - no more than 2 network connections for at least 500 ms
  const navigation = { waitUntil: 'networkidle0' };
  await page.goto(url, navigation);
  return page.content();
}

/**
 * Converts a URL into the local file path used for its saved copy:
 * `<__dirname>/dist/<hostname>/<pathname>`. The port, query string and
 * fragment are not part of the result.
 *
 * @param {string} url - Absolute URL of the page or resource.
 * @returns {string} Absolute local file path.
 * @throws {TypeError} If `url` is not a valid absolute URL.
 */
function url2path(url){
  const { hostname, pathname } = new URL(url);
  const segments = pathname.split('/');
  // A pathname ending in "/" would resolve to a directory path. Writing a
  // page there creates a plain file where a directory is needed later, so
  // store such pages as index.html inside that directory.
  if (pathname.endsWith('/')) {
    segments.push('index.html');
  }
  return path.resolve(__dirname, './dist', hostname, ...segments);
}
/**
 * Scans a page's HTML for <link>, <script> and <img> references that point at
 * the page's own site and queues every one that has not been handled yet.
 *
 * Results are appended to the module-level `waitting_stylesheets`,
 * `waitting_scripts` and `waitting_images` arrays as `{ filename, urlpath }`
 * objects; nothing is returned.
 *
 * @param {string} url - Absolute URL of the page the HTML was loaded from.
 * @param {string} html - HTML source of that page.
 * @throws {TypeError} If `url` is not a valid absolute URL.
 */
function extract_urls(url, html){
  const page_url = new URL(url);
  const $ = cheerio.load(html);

  // References that carry their own scheme (http:, https:, data:, ...) or a
  // protocol-relative "//host" prefix are left untouched, as absolute URLs
  // were before. Requiring the ":" avoids skipping relative files whose name
  // merely starts with "http".
  const external = /^(?:[a-z][a-z0-9+.-]*:|\/\/)/i;

  // Queues every local reference found in `attribute` of `elements`, unless it
  // is already recorded in `seen` or `waiting`.
  const collect = (elements, attribute, seen, waiting, accept) => {
    for (const element of elements) {
      const src = element.attribs[attribute] || '';
      if (!src || external.test(src)) continue;
      if (accept && !accept(src)) continue;

      // Let the URL parser resolve "a.css", "./a.css", "../a.css" and "/a.css"
      // against the page URL; string concatenation produced "host//a.css".
      const urlpath = new URL(src, page_url).href;
      const filename = url2path(urlpath);

      const same_file = (entry) => entry.filename === filename;
      if (seen.some(same_file) || waiting.some(same_file)) continue;

      waiting.push({ filename, urlpath });
    }
  };

  collect($('link'), 'href', seen_stylesheets, waitting_stylesheets);
  collect($('script'), 'src', seen_scripts, waitting_scripts);
  // Only .jpg / .png images are mirrored.
  collect($('img'), 'src', seen_images, waitting_images, (src) => /\.(jpg|png)$/.test(src));
}

/**
 * Ensures that the directory `filepath` exists, creating any missing parent
 * directories along the way.
 *
 * @param {string} filepath - Directory path to create.
 * @returns {boolean} Always `true` once the path exists.
 * @throws {Error} Any error raised by `fs.mkdirSync` (e.g. permission denied).
 */
function mkdirs(filepath) {
  // Fast path kept from the original: anything already present counts as done.
  if (fs.existsSync(filepath)) {
    return true;
  }
  // The built-in recursive mode creates every missing ancestor in one call.
  // The hand-written recursion it replaces never terminated when no ancestor
  // of the path existed (path.dirname of a root is the root itself).
  fs.mkdirSync(filepath, { recursive: true });
  return true;
}

/**
 * Writes `data` to `filename`, creating the parent directory first.
 *
 * @param {string} filename - Destination file path.
 * @param {string|Buffer|ArrayBuffer} data - Content handed to `fs.writeFile`.
 * @returns {Promise<string>} Resolves with `filename` once the write finished;
 *   rejects with the error from directory creation or from `fs.writeFile`.
 */
function write(filename, data) {
  return new Promise((resolve, reject) => {
    // A throw here (directory cannot be created) rejects the promise.
    mkdirs(path.dirname(filename));
    fs.writeFile(filename, data, (err) => {
      if (err) {
        reject(err);
        // Stop here so the failure path never falls through to resolve().
        return;
      }
      resolve(filename);
    });
  });
}

// Entry point: renders every queued page in a browser, saves its HTML, then
// downloads the stylesheets, scripts and images that page references.
(async () => {
  // Downloads each queued resource in order and waits for the file to be
  // written before recording it in `seen`. A failing request or write is
  // logged and skipped so one bad resource does not abort the crawl.
  const download_all = async (queue, seen, options) => {
    for (const resource of queue) {
      try {
        const res = await axios.get(resource.urlpath, options);
        // Anything other than a 200 response is saved as an empty file.
        const data = res && res.status === 200 ? res.data : '';
        const filename = await write(resource.filename, data);
        console.log(filename + ' 写入完成');
        seen.push(Object.assign({}, resource));
      } catch (err) {
        console.log(err);
      }
    }
  };

  const browser = await puppeteer.launch({headless: false});
  try {
    const page = await browser.newPage();
    for (const start_url of waitting_htmls) {
      let html;
      try {
        html = await fetch(start_url, page);
      } catch (err) {
        // Navigation failed: log it and move on to the next page.
        console.log(err);
        continue;
      }

      // html: wait for the write so the browser is never closed mid-write.
      try {
        const filename = await write(url2path(start_url), html);
        console.log(filename + ' 写入完成');
        seen_htmls.push(start_url);
      } catch (err) {
        console.log(err);
      }

      // Parse the html and fill the three resource queues.
      extract_urls(start_url, html);

      // css
      await download_all(waitting_stylesheets, seen_stylesheets);
      waitting_stylesheets = [];
      // js
      await download_all(waitting_scripts, seen_scripts);
      waitting_scripts = [];
      // image: binary content must be fetched as an ArrayBuffer.
      await download_all(waitting_images, seen_images, {
        responseType: 'arraybuffer'
      });
      waitting_images = [];
    }
    waitting_htmls = [];
    await page.close();
  } finally {
    // Always release the browser process, even when the crawl throws.
    await browser.close();
  }
})().catch((err) => {
  // Last-resort handler so a failure is reported instead of becoming an
  // unhandled promise rejection.
  console.log(err);
  process.exitCode = 1;
});

5、执行展示

 

成功抓取到所有静态资源文件,就差最后部署。

4、使用express进行部署

 

// Minimal Express server that serves the crawled snapshot as static files.
const express = require('express');

const app = express();

// Alternative snapshot directory, left disabled:
// app.use(express.static('dist/app2.jg.eastmoney.com'));
// Serve the files saved under dist/localhost.
app.use(express.static('dist/localhost'));

// Accept connections on port 3001.
app.listen(3001);

 

使用node server.js启动express,打开http://localhost:3001,网页不会向后端请求http://localhost:8089/api/test,降低服务器压力。

 

加载全部内容

相关教程
猜你喜欢
用户评论