smol/examples/web-crawler.rs

//! Crawls the Rust language website and prints found pages.
//!
//! Run with:
//!
//! ```
//! cargo run --example web-crawler
//! ```
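//!
//! Besides smol itself, the example leans on a few crates visible in the
//! imports below: `anyhow` for the `Result` type, `async-channel` for the
//! bounded channel, `scraper` for HTML parsing, and `surf` as the HTTP
//! client (their versions are pinned in smol's own Cargo.toml, not shown here).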

use std::collections::{HashSet, VecDeque};

use anyhow::Result;
use async_channel::{bounded, Sender};
use scraper::{Html, Selector};

const ROOT: &str = "https://www.rust-lang.org";

/// Fetches the HTML contents of a web page.
async fn fetch(url: String, sender: Sender<String>) {
    let body = surf::get(&url).recv_string().await;
    let body = body.unwrap_or_default();

    // Deliver the body; if the receiver was dropped, there is nobody to tell.
    sender.send(body).await.ok();
}
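
// Note: a failed fetch still sends an (empty) message, so the task accounting
// in `main` stays balanced: every spawned task delivers exactly one body.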
/// Extracts links from an HTML body.
fn links(body: String) -> Vec<String> {
    let mut v = Vec::new();
    for elem in Html::parse_fragment(&body).select(&Selector::parse("a").unwrap()) {
        if let Some(link) = elem.value().attr("href") {
            v.push(link.to_string());
        }
    }
    v
}
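
// A minimal sanity check for `links` (a sketch: the HTML snippet is invented
// for illustration, but it exercises the same `scraper` calls used above).
#[cfg(test)]
mod tests {
    use super::links;

    #[test]
    fn extracts_href_attributes() {
        // Anchors without an `href` attribute are skipped.
        let html = r#"<a href="/learn">Learn</a><a>no href</a>"#.to_string();
        assert_eq!(links(html), vec!["/learn".to_string()]);
    }
}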
fn main() -> Result<()> {
    smol::block_on(async {
        let mut seen = HashSet::new();
        let mut queue = VecDeque::new();
        seen.insert(ROOT.to_string());
        queue.push_back(ROOT.to_string());

        // The channel's capacity doubles as the cap on concurrent fetch tasks.
        let (s, r) = bounded(200);
        let mut tasks = 0;

        // Loop while the queue is not empty or tasks are fetching pages.
        while queue.len() + tasks > 0 {
            // Limit the number of concurrent tasks.
            while tasks < s.capacity().unwrap() {
                // Process URLs in the queue and fetch more pages.
                match queue.pop_front() {
                    None => break,
                    Some(url) => {
                        println!("{}", url);
                        tasks += 1;
                        smol::spawn(fetch(url, s.clone())).detach();
                    }
                }
            }

            // Get a fetched web page.
            let body = r.recv().await.unwrap();
            tasks -= 1;

            // Parse links in the web page and add them to the queue.
            for mut url in links(body) {
                // Add the site prefix if it's missing.
                if url.starts_with('/') {
                    url = format!("{}{}", ROOT, url);
                }

                // Normalize before deduplicating so `/path` and `/path/` count as one page.
                url = url.trim_end_matches('/').to_string();

                // If the URL is on this site and was not seen already, push it into the queue.
                if url.starts_with(ROOT) && seen.insert(url.clone()) {
                    queue.push_back(url);
                }
            }
        }

        Ok(())
    })
}
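
// Running this example prints each URL as it is scheduled for fetching; the
// crawl ends once the queue drains and all in-flight tasks have reported back.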