, 4 .
EDIT: , , . , , , , , 1.9.6.16:
open System.Collections.Generic
open System.Net
open System.IO
open System.Threading
open System.Text.RegularExpressions
let limit = 10
let linkPat = "href=\s*\"[^\"h]*(http://[^&\"]*)\""
let getLinks (txt:string) =
[ for m in Regex.Matches(txt,linkPat) -> m.Groups.Item(1).Value ]
let (<--) (mp: MailboxProcessor<_>) x = mp.Post(x)
type RequestGate(n:int) =
let semaphore = new Semaphore(initialCount=n,maximumCount=n)
member x.AcquireAsync(?timeout) =
async { let! ok = semaphore.AsyncWaitOne(?millisecondsTimeout=timeout)
if ok then
return
{ new System.IDisposable with
member x.Dispose() =
semaphore.Release() |> ignore }
else
return! failwith "couldn't acquire a semaphore" }
let webRequestGate = RequestGate(5)
let collectLinks (url:string) =
async {
let! html =
async {
use! holder = webRequestGate.AcquireAsync()
let req = WebRequest.Create(url,Timeout=5)
use! response = req.AsyncGetResponse()
use reader = new StreamReader(
response.GetResponseStream())
return! reader.AsyncReadToEnd() }
let links = getLinks html
do printfn "finished reading %s, got %d links"
url (List.length links)
return links }
let urlCollector =
MailboxProcessor.Start(fun self ->
let rec waitForUrl (visited : Set<string>) =
async { if visited.Count < limit then
let! url = self.Receive()
if not (visited.Contains(url)) then
Async.Start
(async { let! links = collectLinks url
for link in links do
do self <-- link })
return! waitForUrl(visited.Add(url)) }
waitForUrl(Set.Empty))
urlCollector <-- "http:
System.Console.ReadKey() |> ignore