/// <summary>
/// Shared HTTP helpers: a single <see cref="HttpClient"/> reused for every request, plus
/// methods that download a page and parse it into an <see cref="IHtmlDocument"/>.
/// </summary>
public static class HttpHelper {
    /// <summary>Value of the User-Agent header added to <see cref="Client"/>'s default request headers.</summary>
    public const string UserAgent =
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36";

    /// <summary>The message handler that <see cref="Client"/> was constructed with.</summary>
    public static HttpClientHandler Handler { get; }

    /// <summary>The shared client used by every helper in this class.</summary>
    public static HttpClient Client { get; }

    static HttpHelper() {
        Handler = new HttpClientHandler();
        Client = new HttpClient(Handler);
        Client.DefaultRequestHeaders.Add("User-Agent", UserAgent);
    }

    /// <summary>
    /// Downloads <paramref name="url"/> as a string and parses it as HTML.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <returns>The parsed document.</returns>
    public static async Task<IHtmlDocument> GetHtmlDocument(string url) {
        var html = await Client.GetStringAsync(url);
        // TODO (from the original author): this usage has a memory-leak problem and needs optimizing.
        // NOTE(review): presumably refers to the per-call parser / returned document lifetime — confirm.
        return new HtmlParser().ParseDocument(html);
    }

    /// <summary>
    /// Downloads <paramref name="url"/> as raw bytes, decodes them with the encoding named by
    /// <paramref name="charset"/>, and parses the result as HTML.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <param name="charset">Encoding name passed to <see cref="Encoding.GetEncoding(string)"/>.</param>
    /// <returns>The parsed document.</returns>
    public static async Task<IHtmlDocument> GetHtmlDocument(string url, string charset) {
        // Resolve the encoding first so an unknown charset name fails before any network I/O.
        // NOTE(review): legacy code pages (e.g. GBK) may need an encoding provider registered
        // on .NET Core — confirm for the target runtime.
        var encoding = Encoding.GetEncoding(charset);

        string resStr;
        // Dispose the response once its body has been copied out; the original never released it.
        using (var res = await Client.GetAsync(url)) {
            // The status code is not checked here; whatever body the server returned gets parsed.
            var resBytes = await res.Content.ReadAsByteArrayAsync();
            resStr = encoding.GetString(resBytes);
        }

        // TODO (from the original author): this usage has a memory-leak problem and needs optimizing.
        // NOTE(review): the response is now disposed; the parser / document lifetime is still open.
        return new HtmlParser().ParseDocument(resStr);
    }
}
复制代码
这段代码里面有俩 todo,这个内存泄漏的问题在简单的爬虫中影响不大,所以后面有大规模的需求再来优化吧~
4. 搞 HTML
// Fetch the listing page, then crawl the target of the first <a> inside each ".pagew li" entry.
var data = await HttpHelper.GetHtmlDocument(url);
foreach (var listEntry in data.QuerySelectorAll(".pagew li")) {
    // Either the anchor or its href attribute may be absent; both cases yield null here.
    var target = listEntry.QuerySelector("a")?.GetAttribute("href");
    if (target == null) {
        continue;
    }

    await CrawlItem(target);
}
复制代码
或者结合正则表达式
// Fetch the listing page and read the pagination summary from the ".pageinfo" element.
var data = await HttpHelper.GetHtmlDocument(url);
var page = data.QuerySelector(".pageinfo");
Console.WriteLine("拿到分页信息:{0}", page?.TextContent);

// Group 1 captures the number before "页" (used as the page count); group 2 captures the
// number before "条" and is not used below.
var pageText = page?.TextContent ?? "";
var match = Regex.Match(pageText, @"共\s(\d+)页(\d+)条");
if (!match.Success) {
    // Without this guard int.Parse("") throws a FormatException with no hint about the cause.
    // The exception type is kept the same; only the message gains context.
    throw new FormatException(
        $"Pagination text did not match the expected pattern; got: \"{pageText}\"");
}

var pageCount = int.Parse(match.Groups[1].Value);
// Pages are numbered from 1 up to and including pageCount.
for (int i = 1; i <= pageCount; i++) {
    await CrawlPage(i);
}
复制代码
使用方法依然是一行代码
// Serializer settings: indented output, using the relaxed encoder instead of the default one.
var jsonOption = new JsonSerializerOptions();
jsonOption.Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping;
jsonOption.WriteIndented = true;