全局安装typescript:
创新互联成立10年来,这条路我们正越走越好,积累了技术与客户资源,形成了良好的口碑。为客户提供成都网站建设、成都网站设计、网站策划、网页设计、域名申请、网络营销、VI设计、网站改版、漏洞修补等服务。网站是否美观、功能强大、用户体验好、性价比高、打开快等等,这些对于网站建设都非常重要,创新互联通过对建站技术性的掌握、对创意设计的研究为客户提供一站式互联网解决方案,携手广大客户,共同发展进步。
- npm install -g typescript
目前版本2.0.3,这个版本不再需要使用typings命令了。但是vscode捆绑的版本是1.8的,需要一些配置工作,看本文的处理办法。
测试tsc命令:
- tsc
创建要写的程序项目文件夹:
- mkdir test-typescript-spider
进入该文件夹:
- cd test-typescript-spider
初始化项目:
- npm init
安装superagent和cheerio模块:
- npm i --save superagent cheerio
安装对应的类型声明模块:
- npm i -s @types/superagent --save
- npm i -s @types/cheerio --save
安装项目内的typescript(必须走这一步):
- npm i --save typescript
用vscode打开项目文件夹。在该文件夹下创建tsconfig.json文件,并复制以下配置代码进去:
- {
- "compilerOptions": {
- "target": "ES6",
- "module": "commonjs",
- "noEmitOnError": true,
- "noImplicitAny": true,
- "experimentalDecorators": true,
- "sourceMap": false,
- // "sourceRoot": "./",
- "outDir": "./out"
- },
- "exclude": [
- "node_modules"
- ]
- }
在vscode打开“文件”-“首选项”-“工作区设置”,在settings.json中加入以下配置(如果不做这个配置,vscode会在打开项目的时候提示选择哪个版本的typescript):
- {
- "typescript.tsdk": "node_modules/typescript/lib"
- }
创建api.ts文件,复制以下代码进去:
- import superagent = require('superagent');
- import cheerio = require('cheerio');
- /**
-  * Fetch a URL with superagent and expose the callback result as a Promise.
-  * @param url Address to request with GET.
-  * @returns Promise resolved with the superagent response, or rejected with the request error.
-  */
- export const remote_get = function (url: string) {
- // The explicit type argument lets callers read `res.text` while "noImplicitAny" is enabled.
- const promise = new Promise<superagent.Response>(function (resolve, reject) {
- superagent.get(url)
- .end(function (err, res) {
- if (!err) {
- resolve(res);
- } else {
- // Log the failure here as well as handing it to the caller.
- console.log(err);
- reject(err);
- }
- });
- });
- return promise;
- }
创建app.ts文件,书写测试代码:
- import api = require('./api');
- // Smoke test: request the Baidu home page and print the response text.
- const go = async () => {
- const home_page = await api.remote_get('http://www.baidu.com/');
- console.log(home_page.text);
- }
- go();
执行命令:
- tsc
然后:
- node out/app
观察输出是否正确。
现在尝试抓取http://cnodejs.org/的第一页文章链接。
修改app.ts文件,代码如下:
- import api = require('./api');
- import cheerio = require('cheerio');
- // Fetch the CNode home page and print the title and absolute URL of every listed topic.
- const go = async () => {
- const res = await api.remote_get('http://cnodejs.org/');
- const $ = cheerio.load(res.text);
- const urls: string[] = [];
- const titles: string[] = [];
- $('.topic_title_wrapper').each((index, element) => {
- // Look the title link up once and reuse it for both the text and the href.
- const link = $(element).find('.topic_title').first();
- titles.push(link.text().trim());
- // Host without a trailing slash, as in the later snippets of this article;
- // presumably the href already starts with "/", so a slash here would double it.
- urls.push('http://cnodejs.org' + link.attr('href'));
- });
- console.log(titles, urls);
- }
- go();
观察输出,文章的标题和链接都已获取到了。
现在尝试深入抓取文章内容
- import api = require('./api');
- import cheerio = require('cheerio');
- // Fetch the index page, then request every topic page and print its content.
- const go = async () => {
- const res = await api.remote_get('http://cnodejs.org/');
- const $ = cheerio.load(res.text);
- // NOTE(review): the callback is async and nothing here awaits its result, so the topic
- // requests are presumably all started without waiting for one another.
- $('.topic_title_wrapper').each(async (index, element) => {
- let url = ('http://cnodejs.org' + $(element).find('.topic_title').first().attr('href'));
- const res_content = await api.remote_get(url);
- const $_content = cheerio.load(res_content.text);
- console.log($_content('.topic_content').first().text());
- })
- }
- go();
可以发现因为访问服务器太迅猛,导致出现很多次503错误。
解决:
添加helper.ts文件:
- /**
-  * Pause for the given number of seconds.
-  * @param seconds How long to wait, in seconds (converted to milliseconds for setTimeout).
-  * @returns Promise that resolves when the timer fires.
-  */
- export const wait_seconds = function (seconds: number) {
- return new Promise(resolve => setTimeout(resolve, seconds * 1000));
- }
修改api.ts文件为:
- import superagent = require('superagent');
- import cheerio = require('cheerio');
- /**
-  * Collect the absolute URL of every topic linked from the CNode home page.
-  * @returns Promise resolved with the list of topic URLs.
-  */
- // The function must be async because it awaits remote_get.
- export const get_index_urls = async function () {
- const res = await remote_get('http://cnodejs.org/');
- const $ = cheerio.load(res.text);
- const urls: string[] = [];
- // The callback does no asynchronous work, so it is a plain function.
- $('.topic_title_wrapper').each((index, element) => {
- urls.push('http://cnodejs.org' + $(element).find('.topic_title').first().attr('href'));
- });
- return urls;
- }
- /**
-  * Download one topic page and extract its body text.
-  * @param url Address of the topic page.
-  * @returns Promise resolved with the text of the first `.topic_content` element.
-  */
- export const get_content = async (url: string) => {
- const page = await remote_get(url);
- const $page = cheerio.load(page.text);
- const content = $page('.topic_content').first();
- return content.text();
- }
- /**
-  * Fetch a URL with superagent and expose the callback result as a Promise.
-  * @param url Address to request with GET.
-  * @returns Promise resolved with the superagent response, or rejected with the request error.
-  */
- export const remote_get = function (url: string) {
- // The explicit type argument lets callers read `res.text` while "noImplicitAny" is enabled.
- const promise = new Promise<superagent.Response>(function (resolve, reject) {
- superagent.get(url)
- .end(function (err, res) {
- if (!err) {
- resolve(res);
- } else {
- // Log the failure here as well as handing it to the caller.
- console.log(err);
- reject(err);
- }
- });
- });
- return promise;
- }
修改app.ts文件为:
- import api = require('./api');
- import helper = require('./helper');
- import cheerio = require('cheerio');
- // Print the content of every topic from the index, pausing one second before each request.
- const go = async () => {
- const topic_urls = await api.get_index_urls();
- for (const topic_url of topic_urls) {
- await helper.wait_seconds(1);
- const content = await api.get_content(topic_url);
- console.log(content);
- }
- }
- go();
观察输出可以看到,程序实现了隔一秒再请求下一个内容页。
现在尝试把抓取到的东西存到数据库中。安装mongoose模块:
- npm i mongoose --save
- npm i -s @types/mongoose --save
然后建立Schema。先创建models文件夹:
- mkdir models
在models文件夹下创建index.ts:
- import * as mongoose from 'mongoose';
- // Open the MongoDB connection used by the models; the process exits if it cannot be established.
- mongoose.connect('mongodb://127.0.0.1/cnodejs_data', {
- server: { poolSize: 20 }
- }, function (err) {
- if (err) {
- // Report why the connection failed instead of exiting silently.
- console.error(err);
- process.exit(1);
- }
- });
- // models
- // The path matches the file name "Article.ts" exactly; a lower-case path fails on case-sensitive filesystems.
- export const Article = require('./Article');
在models文件夹下创建IArticle.ts:
- /** Plain shape of a crawled article: its title, source URL and body text. */
- interface IArticle {
- // Primitive `string` rather than the `String` wrapper object type.
- title: string;
- url: string;
- text: string;
- }
- export = IArticle;
在models文件夹下创建Article.ts:
- import mongoose = require('mongoose');
- import IArticle = require('./IArticle');
- /** Document type for a stored article: the IArticle fields combined with mongoose.Document. */
- interface IArticleModel extends IArticle, mongoose.Document { }
- const ArticleSchema = new mongoose.Schema({
- title: { type: String },
- url: { type: String },
- text: { type: String },
- });
- // The type argument types model instances as IArticleModel, so title/url/text are checked.
- const Article = mongoose.model<IArticleModel>("Article", ArticleSchema);
- export = Article;
修改api.ts为:
- import superagent = require('superagent');
- import cheerio = require('cheerio');
- import models = require('./models');
- const Article = models.Article;
- /**
-  * Read the CNode home page and build the absolute URL of each listed topic.
-  * @returns Promise resolved with the topic URLs.
-  */
- export const get_index_urls = async () => {
- const index_page = await remote_get('http://cnodejs.org/');
- const $ = cheerio.load(index_page.text);
- const topic_urls: string[] = [];
- $('.topic_title_wrapper').each((index, element) => {
- const href = $(element).find('.topic_title').first().attr('href');
- topic_urls.push('http://cnodejs.org' + href);
- });
- return topic_urls;
- }
- /**
-  * Download one topic page, extract its fields and store them as an Article document.
-  * @param url Address of the topic page; also stored on the document.
-  * @returns Promise that settles after the save finishes; rejects if the request or the save fails.
-  */
- export const fetch_content = async function (url: string) {
- const res = await remote_get(url);
- const $ = cheerio.load(res.text);
- const article = new Article();
- article.text = $('.topic_content').first().text();
- // Remove the "置顶" (pinned) and "精华" (featured) labels from the extracted title text.
- article.title = $('.topic_full_title').first().text().replace('置顶', '').replace('精华', '').trim();
- article.url = url;
- console.log('获取成功:' + article.title);
- // Await the write so a failed save rejects this function and the caller cannot finish before it completes.
- await article.save();
- }
- /**
-  * Fetch a URL with superagent and expose the callback result as a Promise.
-  * @param url Address to request with GET.
-  * @returns Promise resolved with the superagent response, or rejected with the request error.
-  */
- export const remote_get = function (url: string) {
- // The explicit type argument lets callers read `res.text` while "noImplicitAny" is enabled.
- return new Promise<superagent.Response>((resolve, reject) => {
- superagent.get(url)
- .end(function (err, res) {
- if (!err) {
- resolve(res);
- } else {
- reject(err);
- }
- });
- });
- }
修改app.ts为:
- import api = require('./api');
- import helper = require('./helper');
- import cheerio = require('cheerio');
- // Entry point: store every topic from the index, one second apart, then print the completion message.
- (async () => {
- try {
- const topic_urls = await api.get_index_urls();
- for (const topic_url of topic_urls) {
- await helper.wait_seconds(1);
- await api.fetch_content(topic_url);
- }
- } catch (err) {
- // Any failure ends the crawl; it is logged and the completion message still prints.
- console.log(err);
- }
- console.log('完毕!');
- })();
执行
- tsc
- node out/app
观察输出,并去数据库检查一下可以发现入库成功了!
补充:remote_get方法的改进版,实现了错误重试,并加入了代理服务器支持。这里放弃了superagent库,改用request库,仅供参考:
- //config.retries = 3;
- let current_retry = config.retries || 0;
- /**
-  * GET a URL with the request library, optionally through a proxy, retrying when the request fails.
-  * Calls wait_seconds(2) before every attempt.
-  * @param url Address to fetch.
-  * @param proxy Optional proxy URI; an empty string is used when it is omitted.
-  * @returns Promise resolved with the response body as a string, or rejected with the error once no retries remain.
-  */
- export const remote_get = async function (url: string, proxy?: string) {
- // Pause briefly before every request.
- await wait_seconds(2);
- if (!proxy) {
- proxy = '';
- }
- // NOTE(review): current_retry is declared outside this function, so all calls share one retry budget.
- const promise = new Promise<string>(function (resolve, reject) {
- console.log('get: ' + url + ', using proxy: ' + proxy);
- let options: request.CoreOptions = {
- headers: {
- 'Cookie': '',
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
- 'Referer': 'https://www.baidu.com/'
- },
- encoding: 'utf-8',
- method: 'GET',
- proxy: proxy,
- timeout: 3000,
- }
- request(url, options, async function (err, response, body) {
- console.log('got:' + url);
- if (!err) {
- body = body.toString();
- // Success: restore the retry budget for the next request.
- current_retry = config.retries || 0;
- console.log('bytes:' + body.length);
- resolve(body);
- } else {
- console.log(err);
- if (current_retry <= 0) {
- // No retries left: restore the budget and give up with the error.
- current_retry = config.retries || 0;
- reject(err);
- } else {
- console.log('retry...(' + current_retry + ')');
- current_retry--;
- try {
- // Retry through this same function and forward whatever it settles with.
- let body = await remote_get(url, proxy);
- resolve(body);
- } catch (e) {
- reject(e);
- }
- }
- }
- });
- });
- return promise;
- }
另外,IArticle.ts和Article.ts合并为一个文件,可能更好,可以参考我另一个model的写法:
- import mongoose = require('mongoose');
- /** Plain fields of a proxy record. */
- interface IProxyModel {
- uri: string;
- ip: string;
- port: string;
- info: string;
- }
- /** Document type for a stored proxy: the IProxyModel fields combined with mongoose.Document. */
- export interface IProxy extends IProxyModel, mongoose.Document { }
- const ProxySchema = new mongoose.Schema({
- uri: { type: String },
- ip: { type: String },
- port: { type: String },
- info: { type: String },
- });
- // The type argument types model instances and query results as IProxy.
- export const Proxy = mongoose.model<IProxy>("Proxy", ProxySchema);
导入的时候这么写就行了:
- import { IProxy, Proxy } from './models';
其中Proxy可以用来做new、find、where之类的操作:
- let x = new Proxy();
- let xx = await Proxy.find({});
- let xxx = await Proxy.where('aaa',123).exec();
而IProxy用于实体对象的传递,例如
- function xxx(p:IProxy){
- }
网站题目:用TypeScript开发爬虫程序
本文地址:http://www.csdahua.cn/qtweb/news33/405433.html
网站建设、网络推广公司-快上网,是专注品牌与效果的网站制作,网络营销seo公司;服务项目有等
声明:本网站发布的内容(图片、视频和文字)以用户投稿、用户转载内容为主,如果涉及侵权请尽快告知,我们将会在第一时间删除。文章观点不代表本网站立场,如需处理请联系客服。电话:028-86922220;邮箱:631063699@qq.com。内容未经允许不得转载,或转载时需注明来源: 快上网