用 ScrapySharp 并行下载天涯图片

2023-12-16 16:49:46

用 ScrapySharp 并行下载天涯图片

#r "HtmlAgilityPack.dll"
#r "ScrapySharp.dll"

open System
open System.Threading.Tasks
open HtmlAgilityPack
open ScrapySharp.Extensions

// Forum thread whose embedded images are to be collected.
let url = "http://bbs.tianya.cn/post-12-563201-1.shtml"

// Fetch the raw HTML of the thread page.
let web = ScrapySharp.Network.ScrapingBrowser()
let html = web.DownloadString(Uri url)

// Parse the downloaded markup so it can be queried with CSS selectors.
let doc = HtmlDocument()
doc.LoadHtml html

// Image addresses taken from the `original` attribute of every <img> that is a
// direct child of a `div.bbs-content` element. The mapping is a lazy sequence.
let urls =
    let images = doc.DocumentNode.CssSelect("div.bbs-content > img")
    images |> Seq.map (fun img -> img.GetAttributeValue("original"))

// Fixed list of image addresses.
// NOTE(review): `urls` is also bound to the scraped sequence earlier in the
// script; when the script is evaluated top to bottom in F# Interactive this
// later binding is the one the download step sees. Confirm whether this list
// was meant as sample output rather than as code.
let urls =
    [ "http://img3.laibafile.cn/p/m/166829011.jpg"
      "http://img3.laibafile.cn/p/m/166829027.jpg"
      "http://img3.laibafile.cn/p/m/166829000.jpg"
      "http://img3.laibafile.cn/p/m/166829039.jpg"
      "http://img3.laibafile.cn/p/m/166829034.jpg"
      "http://img3.laibafile.cn/p/m/166829030.jpg"
      "http://img3.laibafile.cn/p/m/166829016.jpg"
      "http://img3.laibafile.cn/p/m/166829024.jpg" ]

/// Downloads the image at `url` and saves it on disk.
///
/// filePath: a path whose extension-less form is used as the output directory
///           (for example `C:\work\script.fsx` saves into `C:\work\script`).
///           The directory is created when it does not exist.
/// url:      absolute address of the image; the last segment of its path
///           becomes the file name.
///
/// Prints `url` once the image bytes have been fetched, then writes the file.
/// Network and file-system exceptions are not caught here.
let GetPicture (filePath: string) (url: string) =
    // Let Path strip the extension: a path without any '.' no longer throws,
    // and a dot inside a parent directory name is not mistaken for one.
    let path = IO.Path.ChangeExtension(filePath, null)
    let target = Uri url

    // Address opened before the image itself. Both known image hosts map to
    // the forum front page; any other host maps to its own root.
    // NOTE(review): presumably this warms up cookies/referrer so the image
    // host accepts the request - confirm.
    let ty =
        match target.Authority with
        | host when host.Contains("laibafile.cn") || host.Contains("tianya.cn") ->
            "http://bbs.tianya.cn"
        | _ -> target.Scheme + "://" + target.Authority

    let web = ScrapySharp.Network.ScrapingBrowser()
    web.NavigateToPage(Uri ty) |> ignore

    // CreateDirectory does nothing when the directory already exists, so no
    // separate Exists check is needed (this also avoids a check-then-create
    // gap when several downloads start at once).
    IO.Directory.CreateDirectory(path) |> ignore

    // Take the name from the URL's path component so a query string or
    // fragment never ends up in the file name.
    let file = IO.Path.GetFileName(target.LocalPath)
    let pic = web.NavigateToPage(target).RawResponse.Body
    printfn "%s" url
    IO.File.WriteAllBytes(IO.Path.Combine(path, file), pic)

// `filePath` was referenced here without ever being bound in the script.
// GetPicture uses this path minus its extension as the output directory, so
// default it to this script's own path: images are saved in a folder next to
// the script, named after it.
let filePath = IO.Path.Combine(__SOURCE_DIRECTORY__, __SOURCE_FILE__)

// Fix the destination once; the resulting function only needs an image URL.
let outPic = GetPicture filePath

// Download all images concurrently. The ParallelLoopResult is not used.
Parallel.ForEach(urls, outPic) |> ignore

码农公寓

相关文章