概要
機械学習用にコンテンツデータを集めないといけなくて、毎回クローラーを書くのが面倒だったので、汎用的なクローラーを開発しました。
構成
仕組み
LinkCrawlerで起点のURLからリンクを辿って対象のURLを収集し、DBに保存する。
ContentsCrawlerで、収集したURLのページにアクセスしてコンテンツを取得し、DBに保存する。
AWS
アプリケーション
・puppeteer
・node.js
DataBase
table
-- Schema for the crawler database.
-- Whitespace between keywords has been restored so the statements are valid MySQL DDL.

CREATE TABLE `site` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(255) NOT NULL,
  `url` varchar(300) NOT NULL,
  `created_at` datetime NOT NULL,
  `updated_at` datetime NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- URLs collected by link_crawler.js. contents_crawler.js picks up rows with
-- crawl_status = 0 and then stores the HTTP status of the fetch in crawl_status.
CREATE TABLE `site_links` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `site_worker_id` int(11) NOT NULL,
  `url` varchar(300) NOT NULL,
  `crawl_status` int(11) NOT NULL DEFAULT '0',
  `crawl_date` datetime DEFAULT NULL,
  `created_at` datetime NOT NULL,
  `updated_at` datetime NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- JSON-encoded content extracted by contents_crawler.js, one row per site_links row.
CREATE TABLE `site_structure_data` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `site_links_id` int(11) NOT NULL,
  `structure_data` text NOT NULL,
  `created_at` datetime NOT NULL,
  `updated_at` datetime NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Crawl job definition read by link_crawler.js.
-- site_type is 'contents' (follow links with a browser) or 'json' (read a JSON list).
CREATE TABLE `site_worker` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `start_url` varchar(300) NOT NULL,
  `allow_domains` varchar(300) NOT NULL,
  `depth_limit` tinyint(4) NOT NULL DEFAULT '0',
  `allow_url_regex` varchar(300) DEFAULT NULL,
  `deny_url_regex` varchar(300) DEFAULT NULL,
  `site_type` varchar(10) NOT NULL,
  `json_column` varchar(10) DEFAULT NULL,
  `is_deleted` tinyint(4) NOT NULL DEFAULT '0',
  `created_at` datetime NOT NULL,
  `updated_at` datetime NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Named fields that contents_crawler.js extracts for a worker.
CREATE TABLE `site_worker_structure` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `site_worker_id` int(11) NOT NULL,
  `name` varchar(100) NOT NULL,
  `is_deleted` tinyint(4) NOT NULL DEFAULT '0',
  `created_at` datetime NOT NULL,
  `updated_at` datetime NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Selectors tried for each field. A NULL attribute means the element's
-- textContent is used; 'src' means the element is saved as a screenshot.
CREATE TABLE `site_worker_structure_selector` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `site_worker_id` int(11) NOT NULL,
  `site_worker_structure_id` int(11) NOT NULL,
  `selector` varchar(1000) NOT NULL,
  `attribute` varchar(100) DEFAULT NULL,
  `created_at` datetime NOT NULL,
  `updated_at` datetime NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

クローラー
link_crawler.js
require('dotenv').config();
const puppeteer = require('puppeteer');
const config = require('config');
const mysql = require('mysql2/promise');
const request = require('request-promise');
const { URL } = require('url');

// Viewport applied to every page the crawler opens.
const viewportWidth = 1024;
const viewportHeight = 600;
// Chrome profile directory handed to puppeteer.launch().
const userDataDir = `${config.root_path}/tmp/crawler`;
// MySQL connection; opened by the entry point at the bottom and shared by all helpers.
let connection;
const userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36';

console.log('NODE_ENV=%s', process.env.NODE_ENV);

/**
 * Tells whether a URL is already present in the collected link list.
 *
 * @param {Array<{url: string}>} arr - Links collected so far.
 * @param {string} str - URL to look for.
 * @returns {boolean} true when some entry of `arr` has `url === str`.
 */
const in_array = (arr, str) => arr.some((link) => link.url === str);

/**
 * Escapes regular-expression metacharacters so `text` is matched literally.
 *
 * @param {string} text
 * @returns {string}
 */
const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

/**
 * Builds the pattern used to restrict crawling to the allowed domains.
 *
 * The original code used `String.prototype.replace(',', ...)`, which only
 * rewrites the FIRST comma, so a third domain was never matched. Every entry
 * is now handled, and dots in a domain are matched literally.
 *
 * @param {string} allow_domains - Comma-separated domain list.
 * @returns {RegExp} Matches URLs containing `//<domain>` for any listed domain.
 */
function buildDomainPattern(allow_domains) {
  const alternatives = allow_domains
    .split(',')
    .map((domain) => `//${escapeRegExp(domain.trim())}`);
  return new RegExp(alternatives.join('|'));
}

/**
 * Inserts a URL into site_links unless the same (site_worker_id, url) pair
 * is already stored.
 *
 * @param {number} site_worker_id
 * @param {string} url
 * @returns {Promise<void>}
 */
async function saveLink(site_worker_id, url) {
  const data = [site_worker_id, url];
  const [rows] = await connection.execute('SELECT id FROM site_links WHERE site_worker_id = ? AND url = ?', data);
  if (rows.length > 0) {
    return;
  }
  await connection.execute('INSERT INTO site_links(site_worker_id, url, created_at, updated_at) VALUES(?, ?, now(), now())', data);
}

/**
 * Crawls links starting from the worker's start URL and stores the new ones.
 *
 * @param {object} browser - Puppeteer browser instance.
 * @param {object} args - A site_worker row.
 * @returns {Promise<void>}
 */
async function linkCrawler(browser, args) {
  const links = [];
  await getContents(browser, args, links);
  for (const link of links) {
    await saveLink(link.site_worker_id, link.url);
  }
  console.log(links);
}

/**
 * Reads a JSON document from the worker's start URL and stores one link per
 * entry of its `list` member, built as `https://<allow_domains><entry[json_column]>`.
 *
 * @param {object} args - A site_worker row.
 * @returns {Promise<void>}
 */
async function getJson(args) {
  const site_worker_id = args.id;
  const start_url = args.start_url;
  const allow_domains = args.allow_domains;
  const json_column = args.json_column;
  console.log('start_url', start_url);
  const options = {
    url: start_url,
    json: true,
    headers: { 'User-Agent': userAgent },
  };
  const list = await request(options);
  // Object.values() accepts both an array and a keyed object, like the
  // original `for...in` did; a missing `list` member yields no entries.
  for (const detail of Object.values(list.list || {})) {
    const url = `https://${allow_domains}${detail[json_column]}`;
    console.log([site_worker_id, url]);
    await saveLink(site_worker_id, url);
  }
}

/**
 * Opens `args.start_url`, collects every anchor that passes the worker's
 * domain / allow / deny filters into `links`, and recurses into each accepted
 * link while `depth_limit` is greater than the current depth.
 *
 * @param {object} browser - Puppeteer browser instance.
 * @param {object} args - A site_worker row, optionally carrying `depth`
 *   (the depth of the page that linked here).
 * @param {Array<{url: string, depth: number, site_worker_id: number}>} links
 *   Accumulator shared by the whole recursion; mutated in place.
 * @returns {Promise<void>}
 */
async function getContents(browser, args, links) {
  const site_worker_id = args.id;
  const start_url = args.start_url;
  const allow_domains = args.allow_domains;
  const allow_url_regex = args.allow_url_regex;
  const deny_url_regex = args.deny_url_regex;
  const depth_limit = args.depth_limit;
  const depth = args.depth ? args.depth + 1 : 1;

  // Compile the filters once per page instead of once per anchor.
  const domainPattern = allow_domains != null ? buildDomainPattern(allow_domains) : null;
  // An empty pattern would match every URL, so an empty deny filter would
  // reject everything; treat empty the same as "not set".
  const allowPattern = allow_url_regex ? new RegExp(allow_url_regex) : null;
  const denyPattern = deny_url_regex ? new RegExp(deny_url_regex) : null;

  const page = await newPage(browser);
  try {
    console.log('start_url', start_url);
    await page.goto(start_url);
    await page.waitFor(1000);
    const items = await page.$$('a');
    for (const item of items) {
      const str_href = await (await item.getProperty('href')).jsonValue();
      if (str_href === '') {
        continue;
      }
      let url;
      try {
        url = new URL(str_href);
      } catch (e) {
        // An unparsable href must not abort the whole crawl.
        continue;
      }
      // remove hash
      url.hash = '';
      const str_url = url.href;
      if (in_array(links, str_url)) {
        continue;
      }
      // domain
      if (domainPattern !== null && !domainPattern.test(str_url)) {
        continue;
      }
      // url pattern
      if (allowPattern !== null && !allowPattern.test(str_url)) {
        continue;
      }
      if (denyPattern !== null && denyPattern.test(str_url)) {
        continue;
      }
      console.log(depth, str_url);
      links.push({ url: str_url, depth: depth, site_worker_id: site_worker_id });
      if (depth_limit > depth) {
        const params = Object.assign({}, args, { start_url: str_url, depth: depth });
        await getContents(browser, params, links);
      }
    }
  } finally {
    // Close the tab even when navigation or a nested crawl throws.
    await page.close();
  }
}

/**
 * Opens a new tab configured with the crawler's language header, viewport
 * and user agent.
 *
 * @param {object} browser - Puppeteer browser instance.
 * @returns {Promise<object>} The configured page.
 */
async function newPage(browser) {
  const page = await browser.newPage();
  await page.setExtraHTTPHeaders({ 'Accept-Language': 'ja,en-US;q=0.9,en;q=0.8' });
  const options = {
    viewport: {
      width: viewportWidth,
      height: viewportHeight,
    },
    userAgent,
  };
  await page.emulate(options);
  return page;
}

// Entry point. NOTE: only the first non-deleted site_worker row is processed.
(async () => {
  let browser;
  try {
    connection = await mysql.createConnection({
      host: 'localhost',
      user: 'root',
      password: '',
      database: 'crawler',
    });
    const [rows] = await connection.execute('SELECT * FROM site_worker WHERE is_deleted = 0');
    if (rows === undefined || rows.length <= 0) {
      console.log('no data');
      return;
    }
    const params = rows[0];
    if (params.site_type === 'contents') {
      browser = await puppeteer.launch({
        headless: false,
        devtools: false,
        executablePath: config.chrome,
        userDataDir: userDataDir,
        args: ['--no-sandbox', '--disable-setuid-sandbox'],
      });
      await linkCrawler(browser, params);
    }
    if (params.site_type === 'json') {
      await getJson(params);
    }
  } catch (e) {
    console.error(e);
  } finally {
    // Release the browser and the DB connection on success AND on failure,
    // otherwise the process keeps running after an error.
    if (browser !== undefined) {
      await browser.close().catch(console.error);
    }
    if (connection !== undefined) {
      await connection.end().catch(console.error);
    }
  }
})();

contents_crawler.js
require('dotenv').config();
const puppeteer = require('puppeteer');
const moment = require('moment');
const config = require('config');
const mysql = require('mysql2/promise');
const { URL } = require('url');
const uuidv4 = require('uuid/v4');

// Viewport applied to every page the crawler opens.
const viewportWidth = 1024;
const viewportHeight = 600;
// Chrome profile directory handed to puppeteer.launch().
const userDataDir = `${config.root_path}/tmp/crawler`;
// Worker crawled when no id is given on the command line.
const defaultSiteWorkerId = 4;
// MySQL connection; opened by the entry point at the bottom and shared by all helpers.
let connection;

console.log('NODE_ENV=%s', process.env.NODE_ENV);

/**
 * Fetches every link in order and stores its extracted content.
 *
 * @param {object} browser - Puppeteer browser instance.
 * @param {Array<{id: number, site_worker_id: number, url: string}>} links
 *   site_links rows to crawl.
 * @param {Array<object>} structure - Field definitions, each with `name` and
 *   a `selector` array of `{selector, attribute}`.
 * @returns {Promise<void>}
 */
async function contentsCrawler(browser, links, structure) {
  for (const link of links) {
    await getContents(browser, link, structure);
  }
}

/**
 * Extracts one value per field definition from an already loaded page.
 *
 * Selectors of a field are tried in order:
 *  - element not found: the field is set to null and the next selector is tried;
 *  - attribute is null: the element's textContent is taken;
 *  - attribute is 'src': the element is saved as a screenshot and the file
 *    path is taken;
 *  - otherwise the named property is taken, and the next selector is tried
 *    only when that value is falsy.
 * A field without any selector is left out of the result.
 *
 * @param {object} page - Puppeteer page.
 * @param {Array<object>} structure - Field definitions.
 * @returns {Promise<object>} Map of field name to extracted value.
 */
async function collectStructureData(page, structure) {
  const data = {};
  for (const field of structure) {
    const name = field.name;
    for (const { selector, attribute } of field.selector || []) {
      const item = await page.$(selector);
      if (item === null) {
        data[name] = null;
        continue;
      }
      if (attribute === null) {
        data[name] = await (await item.getProperty('textContent')).jsonValue();
        break;
      }
      if (attribute === 'src') {
        const file_name = uuidv4();
        const path = `tmp/images/${file_name}.jpg`;
        // `item` is already the handle for this selector; no second query needed.
        await item.screenshot({
          path: path,
          omitBackground: true,
        });
        data[name] = path;
        break;
      }
      data[name] = await (await item.getProperty(attribute)).jsonValue();
      if (data[name]) {
        break;
      }
    }
  }
  return data;
}

/**
 * Inserts the extracted data for a link, or updates the existing row when one
 * is already stored for that link.
 *
 * @param {number} id - site_links.id.
 * @param {object} data - Extracted content; stored JSON-encoded.
 * @returns {Promise<void>}
 */
async function saveStructureData(id, data) {
  const [rows] = await connection.execute('SELECT id FROM site_structure_data WHERE site_links_id = ?', [id]);
  const sql = rows.length > 0
    ? 'UPDATE site_structure_data set structure_data = ?, updated_at = now() WHERE site_links_id= ?'
    : 'INSERT INTO site_structure_data(structure_data, site_links_id, created_at, updated_at) VALUES(?, ?, now(), now())';
  await connection.execute(sql, [JSON.stringify(data), id]);
}

/**
 * Loads one link, extracts the configured fields, stores them, and records
 * the HTTP status of the navigation in site_links.crawl_status.
 *
 * @param {object} browser - Puppeteer browser instance.
 * @param {{id: number, site_worker_id: number, url: string}} args - site_links row.
 * @param {Array<object>} structure - Field definitions.
 * @returns {Promise<object>} The extracted data.
 */
async function getContents(browser, args, structure) {
  const id = args.id;
  const site_worker_id = args.site_worker_id;
  const url = args.url;

  const page = await newPage(browser);
  try {
    console.log('crawl_url', url);
    const response = await page.goto(url);
    await page.waitFor(10000);
    const status = response.status();

    const data = await collectStructureData(page, structure);
    await saveStructureData(id, data);
    await connection.execute('UPDATE site_links SET crawl_status = ?, crawl_date = now() WHERE id = ? AND site_worker_id = ?', [status, id, site_worker_id]);
    return data;
  } finally {
    // Close the tab even when navigation, extraction or a query throws.
    await page.close();
  }
}

/**
 * Opens a new tab configured with the crawler's language header, viewport
 * and user agent.
 *
 * @param {object} browser - Puppeteer browser instance.
 * @returns {Promise<object>} The configured page.
 */
async function newPage(browser) {
  const page = await browser.newPage();
  await page.setExtraHTTPHeaders({ 'Accept-Language': 'ja,en-US;q=0.9,en;q=0.8' });
  const options = {
    viewport: {
      width: viewportWidth,
      height: viewportHeight,
    },
    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
  };
  await page.emulate(options);
  return page;
}

// Entry point. Usage: node contents_crawler.js [site_worker_id]
(async () => {
  let browser;
  try {
    connection = await mysql.createConnection({
      host: 'localhost',
      user: 'root',
      password: '',
      database: 'crawler',
      charset: 'utf8mb4',
    });

    // The worker id used to be hard-coded; it can now be passed as the first
    // command-line argument and falls back to the previous value.
    const requestedId = Number.parseInt(process.argv[2], 10);
    const site_worker_id = Number.isNaN(requestedId) ? defaultSiteWorkerId : requestedId;

    const [structure] = await connection.execute('SELECT id, site_worker_id, name FROM site_worker_structure WHERE site_worker_id = ? order by id', [site_worker_id]);
    if (structure === undefined || structure.length <= 0) {
      console.log('no data');
      return;
    }

    const [structure_selector] = await connection.execute('SELECT id, site_worker_id, site_worker_structure_id, selector, attribute FROM site_worker_structure_selector WHERE site_worker_id = ?', [site_worker_id]);
    if (structure_selector === undefined || structure_selector.length <= 0) {
      console.log('no data');
      return;
    }

    // Group the selectors by the field they belong to, keeping query order.
    const selectorsByStructure = new Map();
    for (const row of structure_selector) {
      const key = row.site_worker_structure_id;
      if (!selectorsByStructure.has(key)) {
        selectorsByStructure.set(key, []);
      }
      selectorsByStructure.get(key).push({ selector: row.selector, attribute: row.attribute });
    }
    for (const field of structure) {
      field.selector = selectorsByStructure.get(field.id) || [];
    }

    const [links] = await connection.execute('SELECT id, site_worker_id, url FROM site_links WHERE site_worker_id = ? AND crawl_status = 0', [site_worker_id]);
    if (links === undefined || links.length <= 0) {
      console.log('no data');
      return;
    }

    browser = await puppeteer.launch({
      headless: false,
      devtools: false,
      executablePath: config.chrome,
      userDataDir: userDataDir,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    await contentsCrawler(browser, links, structure);
  } catch (e) {
    console.error(e);
  } finally {
    // Release the browser and the DB connection on success AND on failure,
    // otherwise the process keeps running after an error.
    if (browser !== undefined) {
      await browser.close().catch(console.error);
    }
    if (connection !== undefined) {
      await connection.end().catch(console.error);
    }
  }
})();

最後に
うまく動かない場合はごめんなさい。
設定用のツールも作ってWeb上から設定できるようにしていますが、長くなるので割愛します。

