
feat: support robots
baerwang committed Dec 28, 2023
1 parent 6cc8735 commit 262449b
Showing 8 changed files with 706 additions and 7 deletions.
650 changes: 648 additions & 2 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -41,3 +41,4 @@ log = "0.4.20"
toml = "0.8.8"
rand = "0.8.5"
once_cell = "1.19.0"
reqwest = { version = "0.11.23", features = ["blocking", "json"] }
2 changes: 1 addition & 1 deletion README.md
@@ -4,7 +4,7 @@ Daemon is a browser crawler that does URL harvesting in chrome headless mode

## Roadmap

- [ ] Robots
- [x] Robots
- [ ] Javascript Content
- [x] Custom Headers
- [x] Proxy
1 change: 0 additions & 1 deletion src/cli/cmd.rs
@@ -52,7 +52,6 @@ pub fn cli() -> Result<(), Box<dyn std::error::Error>> {
    let proxy = Some(c.proxy.as_deref().unwrap_or_default());
    let launch_options = LaunchOptions::default_builder()
        .path(chromium_path)
        .proxy_server(None)
        .headless(c.headless)
        .sandbox(c.sandbox)
        .proxy_server(proxy)
3 changes: 1 addition & 2 deletions src/handler/crawler.rs
@@ -20,13 +20,12 @@ pub fn browse_wikipedia(
                .collect(),
        )
        .unwrap();
        let h1 = tab.wait_for_xpath("/html/body/div/h1")?;
        assert_eq!(h1.get_inner_text().unwrap().as_str(), "Example Domain");
        let ug = tab
            .evaluate("window.navigator.userAgent", false)?
            .value
            .unwrap();
        assert_eq!(random_ug, ug);
        _ = tab.close(true);
    }

    Ok(())
1 change: 1 addition & 0 deletions src/handler/mod.rs
@@ -1 +1,2 @@
pub mod crawler;
pub mod robots;
35 changes: 35 additions & 0 deletions src/handler/robots.rs
@@ -0,0 +1,35 @@
use std::collections::HashSet;
use std::time;

use reqwest::header::{HeaderMap, USER_AGENT};

use crate::common;

pub fn parse_robots(site: String) -> Result<HashSet<String>, Box<dyn std::error::Error>> {
    let site = site + "/robots.txt";

    let mut headers = HeaderMap::new();
    headers.insert(
        USER_AGENT,
        common::user_agent::random_user_agent().parse().unwrap(),
    );

    let rsp = reqwest::blocking::Client::new()
        .get(site)
        .timeout(time::Duration::from_secs(5))
        .headers(headers)
        .send();
    assert_eq!(rsp.as_ref().unwrap().status(), 200);
    let txt = rsp?.text()?;
    let allow_values: HashSet<String> = txt
        .lines()
        .flat_map(|line| {
            let parts: Vec<&str> = line.split_whitespace().collect();
            match parts.len() {
                2 if parts[0] == "Allow:" || parts[0] == "Disallow:" => Some(parts[1].to_string()),
                _ => None,
            }
        })
        .collect();
    Ok(allow_values)
}
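
To make the parsing step concrete, here is a self-contained sketch of the same `Allow:`/`Disallow:` extraction applied to an inline robots.txt snippet; the sample rules and paths are made up for illustration, and the real input comes from `<site>/robots.txt` as fetched by `parse_robots` above.

```rust
use std::collections::HashSet;

fn main() {
    // A made-up robots.txt body standing in for the text fetched by parse_robots.
    let txt = "User-agent: *\nDisallow: /admin/\nAllow: /public/\n";

    // Same extraction as parse_robots: keep the path that follows an Allow:/Disallow: directive.
    let paths: HashSet<String> = txt
        .lines()
        .flat_map(|line| {
            let parts: Vec<&str> = line.split_whitespace().collect();
            match parts.len() {
                2 if parts[0] == "Allow:" || parts[0] == "Disallow:" => Some(parts[1].to_string()),
                _ => None,
            }
        })
        .collect();

    assert!(paths.contains("/admin/"));
    assert!(paths.contains("/public/"));
}
```

Note that `parse_robots` itself asserts a 200 response before reading the body, so a missing or unreachable robots.txt currently panics rather than returning an error.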
20 changes: 19 additions & 1 deletion tests/integration_test.rs
@@ -1,5 +1,23 @@
#[cfg(test)]
mod tests {
    use demon::handler;
    use handler::robots::parse_robots;

    fn load() {
        let buf = std::env::current_dir()
            .unwrap()
            .join("files/user_agent.toml");
        std::env::set_var("user_agent", buf);
    }

    #[test]
    fn cli_test() {}
    fn parse_robots_test() {
        load();
        assert_ne!(
            parse_robots("https://www.dvwa.co.uk".to_string())
                .unwrap()
                .len(),
            0
        )
    }
}
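
As a usage sketch outside the test harness: `parse_robots` builds its request header from `common::user_agent::random_user_agent()`, which is why the test first points the `user_agent` environment variable at `files/user_agent.toml`. The snippet below mirrors that setup; it assumes the crate is named `demon` as in the test imports and is run from the repository root, and the target URL is only an example.

```rust
use demon::handler::robots::parse_robots;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Point random_user_agent() at the bundled user-agent list, as the test's load() does.
    let ua_file = std::env::current_dir()?.join("files/user_agent.toml");
    std::env::set_var("user_agent", ua_file);

    // Fetch and parse robots.txt into the set of Allow/Disallow paths.
    let rules = parse_robots("https://www.dvwa.co.uk".to_string())?;
    println!("robots.txt lists {} Allow/Disallow paths", rules.len());
    Ok(())
}
```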
