用 ScrapySharp 并行下载天涯图片

用 ScrapySharp 并行下载天涯图片


#r "HtmlAgilityPack.dll"
#r "ScrapySharp.dll"

open System
open System.Threading.Tasks
open HtmlAgilityPack
open ScrapySharp.Extensions


// Address of the forum thread whose pictures are collected.
let url = "http://bbs.tianya.cn/post-12-563201-1.shtml"

// Browser instance used for the one-off page download below.
let web = ScrapySharp.Network.ScrapingBrowser()

// Markup of the thread page, as returned by the browser.
let html = web.DownloadString(Uri url)

// Parsed form of the page; the image query further down reads from it.
let doc = HtmlAgilityPack.HtmlDocument()
doc.LoadHtml html


// Value of the "original" attribute of every <img> that is a direct child of
// a div.bbs-content element in the parsed page.
// Note: a later `let urls` in this script rebinds the same name.
let urls =
    let imageNodes = doc.DocumentNode.CssSelect("div.bbs-content > img")
    seq { for img in imageNodes -> img.GetAttributeValue("original") }


// Fixed list of picture addresses; rebinds `urls`, so the download step below
// works from these eight entries.
let urls =
    [
        "http://img3.laibafile.cn/p/m/166829011.jpg"
        "http://img3.laibafile.cn/p/m/166829027.jpg"
        "http://img3.laibafile.cn/p/m/166829000.jpg"
        "http://img3.laibafile.cn/p/m/166829039.jpg"
        "http://img3.laibafile.cn/p/m/166829034.jpg"
        "http://img3.laibafile.cn/p/m/166829030.jpg"
        "http://img3.laibafile.cn/p/m/166829016.jpg"
        "http://img3.laibafile.cn/p/m/166829024.jpg"
    ]


/// Downloads the picture at `url` and writes it to disk.
///
/// filePath: a file-like path; its extension is stripped and the remainder is
///           used as the output directory (created when missing).
/// url:      absolute address of the picture; the last segment of its path
///           becomes the file name inside the output directory.
///
/// Returns unit. Exceptions raised by the browser or by the file system
/// propagate to the caller.
let GetPicture (filePath: string)  (url: string) =
    // ChangeExtension with a null extension removes the extension only from
    // the final path component and leaves an extension-less path unchanged,
    // whereas Substring(0, LastIndexOf(".")) throws when there is no dot.
    let path = IO.Path.ChangeExtension(filePath, null)
    let pictureUri = Uri url

    // Site root that is visited before the picture itself. Hosts on
    // laibafile.cn / tianya.cn go through the forum front page; any other
    // host uses its own root.
    // NOTE(review): presumably this warms up cookies/referrer so the image
    // host serves the file — confirm.
    let ty =
        let host = pictureUri.Authority
        if host.Contains("laibafile.cn") || host.Contains("tianya.cn") then
            "http://bbs.tianya.cn"
        else
            pictureUri.Scheme + "://" + host

    // A browser per call: this function is invoked from parallel workers.
    let web = ScrapySharp.Network.ScrapingBrowser()
    web.NavigateToPage(Uri ty) |> ignore

    // CreateDirectory does nothing when the directory already exists, so no
    // Exists check is needed (and a check-then-create would race between
    // parallel workers).
    IO.Directory.CreateDirectory(path) |> ignore

    // Take the name from the URL path only, so a query string or fragment
    // never leaks into the file name.
    let file = IO.Path.GetFileName(pictureUri.LocalPath)
    let pic = web.NavigateToPage(pictureUri).RawResponse.Body
    printfn "%s" url
    // Fully qualified: only `System` is opened, so a bare `File` is unresolved.
    IO.File.WriteAllBytes(IO.Path.Combine(path, file), pic)


// Downloader with the output location already applied; takes one picture URL.
// NOTE(review): `filePath` is not bound anywhere in the visible part of this
// script — confirm it is defined before this line runs.
let outPic = GetPicture filePath

// Run the downloader over every URL on the parallel scheduler; the loop
// result is discarded.
ignore (Parallel.ForEach(urls, Action<string> outPic))

用 ScrapySharp 并行下载天涯图片

上一篇:修改linux系统的root密码


下一篇:uva 11572 - Unique Snowflakes(Two Pointers)