csharp D-Link DIR-859 爬蟲 繼之前的取得 IP or MacAddress 又遇到個變態的問題,要抓 D-Link 上面的設備 IP,感覺對這些很陌生,先寫個爬蟲來試看看,以後有更好的方法再換 爬蟲有一堆 lib 可以選擇,這次用 puppeteer-sharp  比較困難點就是要注意操作 ajax or 按鈕這類動作要讓他睡, 此外操作 DOM 的方式也是滿特別的建議要看這個官方的單元測試 最重要的 example 
using PuppeteerSharp;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace ConsoleDIR859
{
    /// <summary>
    /// Console sample that logs in to the router's web UI at 192.168.0.1 with a
    /// headless browser and prints the IPv4 address of every listed client.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point: scrapes the client IPs, prints them one per line and
        /// waits for a key press. Any failure is printed and then rethrown.
        /// </summary>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("DIR-859 Get Client IP Example:");
                string pwd = "your password";

                // GetAwaiter().GetResult() surfaces the original exception instead of
                // the AggregateException that Task.Result would wrap it in.
                string[] ips = GetAllDeviceIP(pwd).GetAwaiter().GetResult();
                foreach (var ip in ips)
                {
                    Console.WriteLine(ip);
                }

                Console.WriteLine("Press Any Key To Exit..");
                Console.ReadKey();
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
                throw;
            }
        }

        /// <summary>
        /// Logs in to the router admin page and collects the text of every
        /// <c>.client_IPv4Address</c> element inside <c>#Client_items</c>.
        /// </summary>
        /// <param name="pwd">Admin password typed into <c>#admin_Password</c>.</param>
        /// <returns>The inner text of each matched element, in document order.</returns>
        /// <exception cref="InvalidOperationException">
        /// The <c>#Client_items</c> container is not present after opening the client list.
        /// </exception>
        static async Task<string[]> GetAllDeviceIP(string pwd)
        {
            await new BrowserFetcher().DownloadAsync(BrowserFetcher.DefaultRevision);

            using (var browser = await Puppeteer.LaunchAsync(new LaunchOptions { Headless = true }))
            using (var page = await browser.NewPageAsync())
            {
                await page.GoToAsync("http://192.168.0.1/info/Login.html");
                await page.WaitForSelectorAsync("#admin_Password");
                await page.FocusAsync("#admin_Password");

                // Short pause before typing, kept from the original flow;
                // Task.Delay does not block the thread the way Thread.Sleep does.
                await Task.Delay(500);
                await page.Keyboard.TypeAsync(pwd);

                // Start waiting for the navigation BEFORE clicking, otherwise a fast
                // navigation can complete before the wait is registered and the
                // wait would then run into its timeout.
                var navigation = page.WaitForNavigationAsync();
                await page.ClickAsync("#logIn_btn");
                var result = await navigation;

                // Reload the landing page reported by the navigation, as the original flow did.
                await page.GoToAsync(result.Url);
                await Task.Delay(3000);

                // Open the client list and give the page's AJAX call time to fill it.
                await page.ClickAsync("#client_image");
                await Task.Delay(1000);

                var clientItems = await page.QuerySelectorAsync("#Client_items");
                if (clientItems == null)
                {
                    throw new InvalidOperationException(
                        "Element '#Client_items' was not found; login may have failed or the page layout changed.");
                }

                return await clientItems.QuerySelectorAllHandleAsync(".client_IPv4Address")
                    .EvaluateFunctionAsync<string[]>("nodes => nodes.map(n => n.innerText)");
            }
        }
    }
}
 
java CheckIn 無聊寫個 java 版本的運用 , 主要使用這個 lib jvppeteer  , 寫起來跟 .net 版本的有點不太一樣 , 好像更無腦
先在 pom.xml 加入這段安裝套件
1 2 3 4 5 6 7 <dependencies> 	<dependency> 		<groupId>io.github.fanyong920</groupId> 		<artifactId>jvppeteer</artifactId> 		<version>1.1.3</version> 	</dependency> </dependencies> 
 
撰寫 code
package com.company;

import com.ruiyun.jvppeteer.core.Puppeteer;
import com.ruiyun.jvppeteer.core.browser.Browser;
import com.ruiyun.jvppeteer.core.browser.BrowserFetcher;
import com.ruiyun.jvppeteer.core.page.ElementHandle;
import com.ruiyun.jvppeteer.core.page.JSHandle;
import com.ruiyun.jvppeteer.core.page.Page;
import com.ruiyun.jvppeteer.options.LaunchOptions;
import com.ruiyun.jvppeteer.options.LaunchOptionsBuilder;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutionException;

/**
 * Check-in crawler sample built on jvppeteer: opens the target page in a
 * locally installed Chrome and drives the health form and the check-in /
 * check-out buttons.
 */
public class Main {

    /**
     * Entry point. Loads the page and fills in the health form; the
     * check-in / check-out steps are left commented out as in the original sample.
     */
    public static void main(String[] args) throws IOException, ExecutionException, InterruptedException {
        Page page = LoadPage();
        // Health / body-temperature declaration
        health(page);
        // Clock in
        //checkIn(page);
        // Clock out
        //checkOut(page);
    }

    /**
     * Launches a visible Chrome instance from a fixed install path and
     * navigates a new page to the local test URL.
     *
     * @return the page after navigation
     */
    public static Page LoadPage() throws IOException, ExecutionException, InterruptedException {
        String path = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe";

        // Populate the argument list BEFORE building the options so the result
        // does not depend on the builder keeping a live reference to the list.
        ArrayList<String> argList = new ArrayList<>();
        argList.add("--no-sandbox");
        argList.add("--disable-setuid-sandbox");

        //BrowserFetcher.downloadIfNotExist(null);
        LaunchOptions options = new LaunchOptionsBuilder()
                .withArgs(argList)
                .withHeadless(false)
                .withExecutablePath(path)
                .build();

        Browser browser = Puppeteer.launch(options);
        Page page = browser.newPage();
        String url = "http://127.0.0.1:5500/index.html";
        page.goTo(url);
        return page;
    }

    /**
     * Waits five seconds, selects the option with value {@code number:1} in the
     * {@code select} element and prints every value returned by the call.
     */
    public static void health(Page page) throws ExecutionException, InterruptedException {
        Thread.sleep(5000);
        List<String> selected = page.select("select", Collections.singletonList("number:1"));
        for (String value : selected) {
            System.out.println(value);
        }
    }

    /** Prints and clicks the {@code .checkIn} button. */
    public static void checkIn(Page page) throws ExecutionException, InterruptedException {
        printAndClick(page, ".checkIn");
    }

    /** Prints and clicks the {@code .checkOut} button. */
    public static void checkOut(Page page) throws ExecutionException, InterruptedException {
        printAndClick(page, ".checkOut");
    }

    /**
     * Shared body of {@link #checkIn(Page)} and {@link #checkOut(Page)}: waits
     * two seconds, looks up {@code selector}, prints its {@code textContent}
     * handle and clicks it. Reports a missing element instead of failing with a
     * NullPointerException.
     */
    private static void printAndClick(Page page, String selector) throws ExecutionException, InterruptedException {
        Thread.sleep(2000);
        ElementHandle btn = page.$(selector);
        if (btn == null) {
            System.out.println("Element not found: " + selector);
            return;
        }
        String text = btn.getProperty("textContent").toString();
        System.out.println(text);
        btn.click();
    }
}
 
csharp CheckIn 需要注意有可能會殘留一堆的 browser 在處理程序上 , 應該是要呼叫 await browser.CloseAsync() 這句把它關掉 , 體驗上反而 java 版本比較好寫
using PuppeteerSharp;
using System;
using System.Threading;
using System.Threading.Tasks;

namespace CheckInCrawler
{
    /// <summary>
    /// Console crawler that opens the check-in page, fills in the health form
    /// and presses the check-in or check-out button, depending on the first
    /// command-line argument.
    /// </summary>
    class Program
    {
        static string url = "http://127.0.0.1:5500/index.html";

        /// <summary>
        /// Entry point. Accepts <c>CheckIn</c> or <c>CheckOut</c> (case-insensitive)
        /// as the first argument; anything else prints a usage hint. Errors are
        /// printed, and the process then waits for Enter before exiting.
        /// </summary>
        static void Main(string[] args)
        {
            try
            {
                // Guard against a missing argument instead of indexing args[0] blindly.
                string command = args.Length > 0 ? args[0] : null;

                // GetAwaiter().GetResult() rethrows the original exception rather than
                // the AggregateException produced by Wait().
                if (string.Equals(command, "CheckIn", StringComparison.OrdinalIgnoreCase))
                {
                    CheckIn().GetAwaiter().GetResult();
                }
                else if (string.Equals(command, "CheckOut", StringComparison.OrdinalIgnoreCase))
                {
                    CheckOut().GetAwaiter().GetResult();
                }
                else
                {
                    Console.WriteLine("Usage: CheckInCrawler <CheckIn|CheckOut>");
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error");
                Console.WriteLine(ex);
            }
            Console.ReadLine();
        }

        /// <summary>
        /// Opens the page in a fresh browser, fills in the health form and
        /// closes the browser again.
        /// </summary>
        static Task Health()
        {
            return RunInBrowserAsync(async page =>
            {
                await Health(page);
                await Task.Delay(2500);
            });
        }

        /// <summary>
        /// Selects the option with value <c>number:1</c> in the first
        /// <c>select</c> element of <paramref name="page"/>. Best effort: any
        /// failure is logged and swallowed so the caller can continue.
        /// </summary>
        static async Task Health(Page page)
        {
            try
            {
                await Task.Delay(5000);

                // QuerySelectorAsync yields null when nothing matches, so the null
                // branch below is actually reachable (indexing [0] into an empty
                // QuerySelectorAllAsync result threw instead).
                var select = await page.QuerySelectorAsync("select");

                // Choose the "healthy" option.
                if (select is not null)
                {
                    await select.SelectAsync("number:1");
                    Console.WriteLine("health:number");
                    await Task.Delay(2500);
                }
                else
                {
                    Console.WriteLine("Health Select Is Null");
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("Health Error:");
                Console.WriteLine(ex.ToString());
                Console.WriteLine("Continue:");
            }
        }

        /// <summary>Fills in the health form and clicks the <c>.CheckIn</c> button.</summary>
        static Task CheckIn()
        {
            return RunInBrowserAsync(page =>
                HealthThenClickAsync(page, ".CheckIn", "checkIn", "CheckIn Btn Is Null"));
        }

        /// <summary>Fills in the health form and clicks the <c>.CheckOut</c> button.</summary>
        static Task CheckOut()
        {
            return RunInBrowserAsync(page =>
                HealthThenClickAsync(page, ".CheckOut", "checkOut", "CheckOut Btn Is Null"));
        }

        /// <summary>
        /// Launches a visible browser, navigates a new page to <see cref="url"/>,
        /// runs <paramref name="action"/> on it and always closes the browser
        /// afterwards so no browser process is left behind.
        /// </summary>
        static async Task RunInBrowserAsync(Func<Page, Task> action)
        {
            await new BrowserFetcher().DownloadAsync(BrowserFetcher.DefaultRevision);
            var browser = await Puppeteer.LaunchAsync(new LaunchOptions
            {
                Headless = false
            });

            try
            {
                var page = await browser.NewPageAsync();
                await page.GoToAsync(url);
                await action(page);
            }
            finally
            {
                await browser.CloseAsync();
            }
        }

        /// <summary>
        /// Shared body of <see cref="CheckIn"/> and <see cref="CheckOut"/>: waits,
        /// fills in the health form, then clicks the first element matching
        /// <paramref name="selector"/> and logs <paramref name="clickedMessage"/>,
        /// or logs <paramref name="missingMessage"/> when no element matches.
        /// </summary>
        static async Task HealthThenClickAsync(Page page, string selector, string clickedMessage, string missingMessage)
        {
            await Task.Delay(2500);
            await Health(page);
            await Task.Delay(2500);

            var btn = await page.QuerySelectorAsync(selector);
            if (btn is not null)
            {
                await btn.ClickAsync();
                Console.WriteLine(clickedMessage);
                await Task.Delay(2500);
            }
            else
            {
                Console.WriteLine(missingMessage);
            }
        }
    }
}
 
nodejs 用法 首先可以到他的官方Puppeteer  安裝後預設會在 $HOME/.cache/puppeteer 底下安裝最近版本的 chrome Windows 等價於 %userprofile%/.cache/puppeteer
 
預設 nodejs 是沒法跑的 (因為範例用的是 ES module 的 import 語法) , 要在 package.json 加上 "type": "module"
1 2 3 4 5 6 {     "type": "module",     "dependencies": {         "puppeteer": "^21.4.1"     } } 
 
新版的 headless 參數有些不一樣
// Launch settings: show the browser window, do not force a default viewport,
// and ask Chrome to start maximized.
const launchOptions = {
    headless: false,
    defaultViewport: null,
    args: ['--start-maximized'],
};

const browser = await puppeteer.launch(launchOptions);
 
然後官方給的 example 預設開啟會是一個空頁 , 然後又開一頁 , 可以改這樣寫就只會有一頁
// Reuse the tab the browser already opened instead of creating a second one.
const pages = await browser.pages();
const [page] = pages;
 
然後就是 帳號 密碼 登入
// Fill these in for the target site.
const loginUrl = '';
const username = '';
const password = '';

const usernameSelector = '#username';
const passwordSelector = '#password';
const loginSelector = '#login';

// Wait for a field to appear, focus it and type the given text into it.
const typeInto = async (selector, text) => {
    await page.waitForSelector(selector);
    await page.focus(selector);
    await page.keyboard.type(text);
};

await page.goto(loginUrl);
await typeInto(usernameSelector, username);
await typeInto(passwordSelector, password);

// Submit the form once the login button is present.
await page.waitForSelector(loginSelector);
await page.click(loginSelector);
 
再來是用 evaluate 去使用 querySelector 最後回傳數值
// Read the href of the first sidebar link inside the page, then navigate to it.
const downloadViewUrl = await page.evaluate(
    () => document.querySelector('#leftsidebar a').href
);
await page.goto(downloadViewUrl);
 
最後是一個存成 json 的方法
// Persist the rows as JSON; a production version should store them in a database instead.
const finalResult = JSON.stringify(resultRows);

// Completion callback: log and rethrow a write error, otherwise report success.
const onWritten = (error) => {
    if (!error) {
        console.log('done!')
        return
    }
    console.log(error)
    throw error
};

fs.writeFile('data.json', finalResult, onWritten);
 
然後下面大致上是一個 example code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 import  puppeteer from  'puppeteer' ;import  * as  fs from  'fs' ;(async  () => {          const  browser = await  puppeteer.launch({                  headless: false ,                           defaultViewport: null ,         args: ['--start-maximized' ]     });                    const  pages = await  browser.pages();     const  page = pages[0 ];                    const  loginUrl = '' ;     await  page.goto(loginUrl);     const  username = '' ;     const  password = '' ;     const  usernameSelector = '#uername' ;     await  page.waitForSelector(usernameSelector);     await  page.focus(usernameSelector);          await  page.keyboard.type(username);     const  passwordSelector = '#pssword' ;     await  page.waitForSelector(passwordSelector);     await  page.focus(passwordSelector);          await  page.keyboard.type(password);          const  loginSelector = '#login' ;     await  page.waitForSelector(loginSelector);     await  page.click(loginSelector);     const  OXUrl = '' ;     await  page.goto(OXUrl);          const  tableSelector = '#main table' ;     await  page.waitForSelector(tableSelector);     await  page.click(tableSelector);          const  firstSensorUrl = await  page.evaluate(()  =>  {         let  rows = document .querySelector('#main table tbody tr' );         let  firstSensorATag = rows.querySelector('td:nth-child(4) > a' )         console .log(firstSensorATag.href);         return  firstSensorATag.href;     });     console .log(firstSensorUrl);     await  
page.goto(firstSensorUrl);          const  downloadViewUrl = await  page.evaluate(()  =>  {         let  aTag = document .querySelector('#leftsidebar a' )         return  aTag.href;     });     await  page.goto(downloadViewUrl);               const  titles = await  page.evaluate(()  =>  {         let  heads = document .querySelectorAll('#main table thead tr th' )         let  result = []         for  (const  [key, value] of  Object .entries(heads)) {             if  (key <= 15 ) result.push(value.innerText)         }         return  result     });          const  realValues = await  page.evaluate(()  =>  {         let  values = document .querySelectorAll('#main table tbody tr td' )         let  result = []         let  row = []         let  counter = 0          for  (const  [key, value] of  Object .entries(values)) {                          if  (row.length === 16 ) {                 result.push(row)                 row = []             }                          if  (counter < 16 ) row.push(value.innerText)             counter++                          if  (counter > 17 ) counter = 0          }         return  result     });          let  pairs = []     let  resultRows = []          for  (let  r = 0 ; r < realValues.length; r++) {         for  (let  i = 0 ; i < titles.length; i++) {                          let  title = titles[i]                          let  row = realValues[r]                          let  value = row[i]                          let  pair = {                 'title' : title,                 'value' : value             }             pairs.push(pair)         }         resultRows.push(pairs)         pairs = []     }     console .log(resultRows)               const  finalResult = JSON .stringify(resultRows);     fs.writeFile('data.json' , finalResult, (error) => {         if  (error) {             console .log(error)             throw  error         }         console .log('done!' )     })          await  browser.close(); })(); 
 
nodejs angularjs 範例 這是一個 angularjs 的範例 , 利用 window.angular.element(document.querySelector('body')).scope() 抓到 scope 然後就可以輕鬆拿 angularjs 裡面的變數 這種 binding 做的東東就算你直接塞數值進去 html 的 tag 裡面 , 也只是表象 最好還是要把數值塞進去真正變數裡面 , 另外這樣塞畫面上還是不會看到數值變化 所以還需要抓取 html tag 方便 debug
// Fill in today's work report: set the values on the AngularJS model (what the
// form actually submits), mirror them into the visible HTML controls for
// debugging, then click the save link.
await page.evaluate(() => {
	// Get the AngularJS scope attached to <body>
	const scope = window.angular.element(document.querySelector('body')).scope()
	const work = scope.form.workobj[0]

	// Planned working hours
	const hours = work.hours
	// Work description
	const genDesc = '執行爬蟲 (web crawler) 自動下載需求單之 excel 並 scan 檔案產生相對應 sql 及核對欄位 , 建立於 db';

	// Today's date as yyyy/MM/dd in LOCAL time. toISOString() is UTC and would
	// give yesterday's date during the early hours in time zones ahead of UTC.
	const now = new Date()
	const pad = (n) => String(n).padStart(2, '0')
	const theDate = `${now.getFullYear()}/${pad(now.getMonth() + 1)}/${pad(now.getDate())}`

	// Set the real values on the AngularJS model
	work.workDesc = genDesc
	work.actworkinghours = hours
	work.launchDate = theDate
	work.actcompletionDate = theDate

	// Locate the anchor header cell whose text is exactly '工作內容'
	const workContentCell = Array.from(document.querySelectorAll('th')).find(ele => ele.textContent === '工作內容')
	// The table section to be signed off
	const workTable = workContentCell.parentNode.parentNode

	// Inputs in document order: actual hours, actual completion date, launch date
	const inputs = workTable.querySelectorAll('input')
	const actualHoursInput = inputs[0]
	const actualCompletionDateInput = inputs[1]
	const launchDateInput = inputs[2]
	// Short work description
	const workDescArea = workTable.querySelector('textarea')

	// Mirror the values into the HTML controls (cosmetic only, for debugging).
	// A textarea shows its `value`; assigning innerText only replaces its
	// default content and may not show up on screen.
	actualCompletionDateInput.value = theDate
	launchDateInput.value = theDate
	workDescArea.value = genDesc
	actualHoursInput.value = hours

	// Save
	document.querySelector('ul a:nth-child(1)').click()
});
 
linux nodejs puppeteer 先確認 ubuntu 版本 , 我之前用 20.04 配的 nodejs 好像是 14.x , 爬蟲用的 puppeteer 需要 16.x
1 2 3 4 5 6 7 lsb_release -a No LSB modules are available. Distributor ID: Ubuntu Description:    Ubuntu 20.04.6 LTS Release:        20.04 Codename:       focal 
 
首先安裝 chrome
1 2 wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb sudo apt install ./google-chrome-stable_current_amd64.deb 
 
接著更新 nodejs 到 16.x 可以參考這裡  教學可以看這篇 
1 2 3 4 5 sudo apt update sudo apt upgrade curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash - sudo apt-get install -y nodejs node -v 
 
然後更新 npm 並且安裝 puppeteer
1 2 3 4 npm install -g npm@10.2.1 mkdir test cd test npm i puppeteer 
 
設定 package.json 的 "type": "module"
1 2 3 4 5 6 { 	"type" : "module" , 	"dependencies" : { 		"puppeteer" : "^21.4.1"  	} } 
 
最後可以跑這段 , 他會 show 出「關於 Google」, 這裡如果你沒 GUI 一定要用 headless
import puppeteer from 'puppeteer';
import * as fs from 'fs';

// Headless launch settings (needed on machines without a GUI).
const launchOptions = {
	headless: true,
	defaultViewport: null,
	args: ['--start-maximized'],
};

// Opens the Google home page and prints the text of the '.MV3Tnb' link.
const main = async () => {
	const browser = await puppeteer.launch(launchOptions);

	// Reuse the tab that is already open.
	const [page] = await browser.pages();

	const googleUrl = 'https://www.google.com.tw/?hl=zh_TW';
	await page.goto(googleUrl);

	const aboutSelector = '.MV3Tnb';
	await page.waitForSelector(aboutSelector);

	// Runs inside the page, so the selector is repeated as a literal.
	const text = await page.evaluate(() => document.querySelector('.MV3Tnb').innerText);
	console.log(text);

	await browser.close();
};

main();