0%

.net core 畜牲爬蟲

 

整理資料無意間翻到以前做過,後來卻沒用到的爬蟲API
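主要使用套件:AngleSharp(https://anglesharp.github.io/)
爬蟲目標網站:http://ppg.naif.org.tw/naif/MarketInformation/Cattle/twStatistics.aspx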
寫得比較不好的部分就是沒把WebClient換成HttpClient並且用DI方式注入,我就懶,有機會再改寫

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
using System;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using AngleSharp;
using AngleSharp.Html.Dom;
using Demo.Models;
using CsvHelper;
using Microsoft.AspNetCore.Mvc;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;

namespace Demo.Controllers
{
/// <summary>
/// API controller that scrapes beef-cattle farm-gate price statistics from the
/// livestock market information site and returns them as JSON.
/// </summary>
/// <remarks>
/// The target page is an ASP.NET WebForms page, so every query is a two-step
/// exchange: first GET the page to obtain the server-generated hidden fields
/// (<c>__VIEWSTATE</c> etc.), then POST the complete form back as if the
/// "query" button had been clicked, and finally parse the result table.
/// </remarks>
[Route("api/[controller]")]
[ApiController]
public class CowController : ControllerBase
{
    // Statistics page that is both the source of the form and the POST target.
    private const string StatisticsUrl = "http://ppg.naif.org.tw/naif/MarketInformation/Cattle/twStatistics.aspx";

    // Naming-container prefix shared by every input of the query form.
    private const string FieldPrefix = "ctl00$ctl00$ContentPlaceHolder_contant$ContentPlaceHolder_contant$";

    // CSS selector of the table that holds the query result.
    private const string ResultTableSelector = "#ContentPlaceHolder_contant_ContentPlaceHolder_contant_Panel_data > table";

    // Values of the "statistics type" radio group.
    private const string MonthlyReport = "RadioButton_m";
    private const string YearlyReport = "RadioButton_y";

    // Hidden WebForms fields that must be echoed back with the POST.
    private static readonly string[] HiddenFieldNames =
    {
        "__VIEWSTATE",
        "__VIEWSTATEGENERATOR",
        "__EVENTVALIDATION",
    };

    // JSON keys of one result row, in the same order as the table columns.
    private static readonly string[] ColumnNames =
    {
        "時間",
        "閹公牛",
        "肥育肉用母牛",
        "肥育乳公牛550公斤以上",
        "週齡仔公牛(隻)",
        "乳公架仔牛150-200公斤",
    };

    /// <summary>
    /// Gets the beef-cattle farm-gate prices as a monthly report.
    /// </summary>
    /// <param name="beginYear">Start year, 4 digits (2010 through 2019), e.g. 2019.</param>
    /// <param name="beginMonth">Start month, 1 or 2 digits (1-12), e.g. 9.</param>
    /// <param name="endYear">End year, 4 digits (2010 through 2019), e.g. 2019.</param>
    /// <param name="endMonth">End month, 1 or 2 digits (1-12), e.g. 9.</param>
    /// <returns>
    /// A JSON array of price rows, for example:
    /// [
    /// {"時間":"2019","閹公牛":"140","肥育肉用母牛":"138","肥育乳公牛550公斤以上":"118","週齡仔公牛(隻)":"1,998","乳公架仔牛150-200公斤":"102"}
    /// ]
    /// </returns>
    /// <exception cref="Exception">The remote server could not be reached.</exception>
    /// <exception cref="InvalidOperationException">The query form lacks an expected hidden field.</exception>
    [HttpGet]
    [Route("month")]
    public async Task<IActionResult> Month(string beginYear, string beginMonth,
        string endYear, string endMonth)
    {
        var range = new Dictionary<string, string>
        {
            { "DropDownList_m_begYear", beginYear },
            { "DropDownList_m_begMonth", beginMonth },
            { "DropDownList_m_endYear", endYear },
            { "DropDownList_m_endMonth", endMonth },
        };

        return await QueryStatisticsAsync(MonthlyReport, range);
    }

    /// <summary>
    /// Gets the beef-cattle farm-gate prices as a yearly report.
    /// </summary>
    /// <param name="beginYear">Start year, 4 digits (2010 through 2019), e.g. 2019.</param>
    /// <param name="endYear">End year, 4 digits (2010 through 2019), e.g. 2019.</param>
    /// <returns>
    /// A JSON array of price rows, for example:
    /// [{"時間":"2019","閹公牛":"140","肥育肉用母牛":"138","肥育乳公牛550公斤以上":"118","週齡仔公牛(隻)":"1,998","乳公架仔牛150-200公斤":"102"},{"時間":"平均","閹公牛":"140","肥育肉用母牛":"138","肥育乳公牛550公斤以上":"118","週齡仔公牛(隻)":"1,998","乳公架仔牛150-200公斤":"102"}]
    /// </returns>
    /// <exception cref="Exception">The remote server could not be reached.</exception>
    /// <exception cref="InvalidOperationException">The query form lacks an expected hidden field.</exception>
    [HttpGet]
    [Route("year")]
    public async Task<IActionResult> Year(string beginYear, string endYear)
    {
        var range = new Dictionary<string, string>
        {
            { "DropDownList_y_beg", beginYear },
            { "DropDownList_y_end", endYear },
        };

        return await QueryStatisticsAsync(YearlyReport, range);
    }

    /// <summary>
    /// Runs one query against the statistics page and converts the result table to JSON rows.
    /// </summary>
    /// <param name="reportType">Value of the statistics-type radio group (monthly or yearly).</param>
    /// <param name="rangeFields">
    /// Form inputs (control name without the container prefix) that carry the
    /// caller-selected range; they replace the placeholder values.
    /// </param>
    /// <returns>200 OK with the list of rows; the list is empty when the response has no result table.</returns>
    private async Task<IActionResult> QueryStatisticsAsync(
        string reportType, IEnumerable<KeyValuePair<string, string>> rangeFields)
    {
        // AngleSharp: https://anglesharp.github.io/
        var context = BrowsingContext.New(AngleSharp.Configuration.Default.WithDefaultLoader());

        // Load the page once to pick up the hidden fields ASP.NET generated for the form.
        var formPage = await context.OpenAsync(StatisticsUrl);

        NameValueCollection form = new NameValueCollection();

        foreach (var hiddenField in HiddenFieldNames)
        {
            var hiddenValue = formPage.QuerySelector("#" + hiddenField)?.GetAttribute("value");
            if (hiddenValue == null)
            {
                // Fail with a descriptive error instead of a bare NullReferenceException.
                throw new InvalidOperationException(
                    "Hidden form field '" + hiddenField + "' was not found on " + StatisticsUrl);
            }

            form[hiddenField] = hiddenValue;
        }

        // The form posts every range input regardless of the selected report type,
        // so the ones not in use are filled with fixed placeholder values.
        // Weekly range (the dates would have to be Mondays; weekly reports are not implemented).
        form[FieldPrefix + "TextBox_d_beg"] = "2019-10-01";
        form[FieldPrefix + "TextBox_d_end"] = "2019-10-01";

        // Statistics type (week, month, year).
        form[FieldPrefix + "time"] = reportType;

        // Monthly range.
        form[FieldPrefix + "DropDownList_m_begYear"] = "2019";
        form[FieldPrefix + "DropDownList_m_begMonth"] = "1";
        form[FieldPrefix + "DropDownList_m_endYear"] = "2019";
        form[FieldPrefix + "DropDownList_m_endMonth"] = "1";

        // Yearly range.
        form[FieldPrefix + "DropDownList_y_beg"] = "2019";
        form[FieldPrefix + "DropDownList_y_end"] = "2019";

        // Replace the placeholders of the range that the caller actually selected.
        foreach (var field in rangeFields)
        {
            form[FieldPrefix + field.Key] = field.Value;
        }

        // The query button itself.
        form[FieldPrefix + "Button_query"] = @"查詢";

        // Form POST emulation with WebClient, based on
        // https://blog.miniasp.com/post/2010/01/23/Emulate-Form-POST-with-WebClient-class
        string resultHtml;
        using (WebClient client = new WebClient())
        {
            client.Encoding = Encoding.UTF8;

            try
            {
                // Awaited so the request thread is not blocked during the upload.
                byte[] responseBytes = await client.UploadValuesTaskAsync(StatisticsUrl, form);
                resultHtml = Encoding.UTF8.GetString(responseBytes);
            }
            catch (WebException ex)
            {
                // Keep the original failure as the inner exception for diagnostics.
                throw new Exception("無法連接遠端伺服器", ex);
            }
        }

        var resultPage = await context.OpenAsync(req => req.Content(resultHtml));
        var table = resultPage.QuerySelector(ResultTableSelector) as IHtmlTableElement;

        List<Dictionary<string, string>> cows = new List<Dictionary<string, string>>();

        if (table == null)
        {
            // No result table in the response: report "no rows" rather than crash.
            return Ok(cows);
        }

        // The first row is the header, so it is skipped.
        foreach (var row in table.Rows.Skip(1))
        {
            if (row.Cells.Length < ColumnNames.Length)
            {
                // Not a data row (too few cells); indexing it would fail.
                continue;
            }

            var cow = new Dictionary<string, string>(ColumnNames.Length);
            for (int column = 0; column < ColumnNames.Length; column++)
            {
                cow.Add(ColumnNames[column], row.Cells[column].TextContent);
            }

            cows.Add(cow);
        }

        return Ok(cows);
    }
}

}
關閉