smol/examples/web-crawler.rs

//! Crawls the Rust language website and prints found pages.
//!
//! Run with:
//!
//! ```
//! cargo run --example web-crawler
//! ```
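//!
//! Besides smol itself, the example leans on a few crates visible in the
//! imports below: `anyhow` for the `Result` type, `async-channel` for the
//! bounded channel, `scraper` for HTML parsing, and `surf` as the HTTP
//! client (their versions are pinned in smol's own Cargo.toml, not shown here).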

use std::collections::{HashSet, VecDeque};

use anyhow::Result;
use async_channel::{bounded, Sender};
use scraper::{Html, Selector};

const ROOT: &str = "https://www.rust-lang.org";

/// Fetches the HTML contents of a web page.
async fn fetch(url: String, sender: Sender<String>) {
    let body = surf::get(&url).recv_string().await;
    let body = body.unwrap_or_default();

    // Deliver the body; if the receiver was dropped, there is nobody to tell.
    sender.send(body).await.ok();
}
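
// Note: a failed fetch still sends an (empty) message, so the task accounting
// in `main` stays balanced: every spawned task delivers exactly one body.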
/// Extracts links from an HTML body.
fn links(body: String) -> Vec<String> {
    let mut v = Vec::new();
    for elem in Html::parse_fragment(&body).select(&Selector::parse("a").unwrap()) {
        if let Some(link) = elem.value().attr("href") {
            v.push(link.to_string());
        }
    }
    v
}
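
// A minimal sanity check for `links` (a sketch: the HTML snippet is invented
// for illustration, but it exercises the same `scraper` calls used above).
#[cfg(test)]
mod tests {
    use super::links;

    #[test]
    fn extracts_href_attributes() {
        // Anchors without an `href` attribute are skipped.
        let html = r#"<a href="/learn">Learn</a><a>no href</a>"#.to_string();
        assert_eq!(links(html), vec!["/learn".to_string()]);
    }
}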
fn main() -> Result<()> {
    smol::block_on(async {
        let mut seen = HashSet::new();
        let mut queue = VecDeque::new();
        seen.insert(ROOT.to_string());
        queue.push_back(ROOT.to_string());

        // The channel's capacity doubles as the cap on concurrent fetch tasks.
        let (s, r) = bounded(200);
        let mut tasks = 0;

        // Loop while the queue is not empty or tasks are fetching pages.
        while queue.len() + tasks > 0 {
            // Limit the number of concurrent tasks.
            while tasks < s.capacity().unwrap() {
                // Process URLs in the queue and fetch more pages.
                match queue.pop_front() {
                    None => break,
                    Some(url) => {
                        println!("{}", url);
                        tasks += 1;
                        smol::spawn(fetch(url, s.clone())).detach();
                    }
                }
            }

            // Get a fetched web page.
            let body = r.recv().await.unwrap();
            tasks -= 1;

            // Parse links in the web page and add them to the queue.
            for mut url in links(body) {
                // Add the site prefix if it's missing.
                if url.starts_with('/') {
                    url = format!("{}{}", ROOT, url);
                }

                // Normalize before deduplicating so `/path` and `/path/` count as one page.
                url = url.trim_end_matches('/').to_string();

                // If the URL is on this site and was not seen already, push it into the queue.
                if url.starts_with(ROOT) && seen.insert(url.clone()) {
                    queue.push_back(url);
                }
            }
        }

        Ok(())
    })
}
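
// Running this example prints each URL as it is scheduled for fetching; the
// crawl ends once the queue drains and all in-flight tasks have reported back.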