Skip to content

Commit

Permalink
feat: browser add headers
Browse files Browse the repository at this point in the history
  • Loading branch information
baerwang committed Dec 27, 2023
1 parent d7cadb0 commit 266d13a
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 19 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Daemon is a browser crawler that does URL harvesting in chrome headless mode

- [ ] Robots
- [ ] Javascript Content
- [ ] Custom Headers
- [x] Custom Headers
- [x] Proxy
- [ ] Form
- [ ] Click
Expand Down
63 changes: 57 additions & 6 deletions src/cli/cmd.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,32 @@
use std::collections::HashMap;

use anyhow::anyhow;
use headless_chrome::browser::default_executable;
use headless_chrome::LaunchOptions;

use crate::handler::crawler;
use crate::model;

/// Builds the top-level command-line arguments of the `demon` command.
///
/// Two options are defined, each accepting zero or more values:
///
/// * `--target` — the targets handed to the crawler.
/// * `--custom-headers` — extra HTTP headers; the caller splits each value
///   on `:` into a header name and a header value.
pub fn demon_args() -> impl IntoIterator<Item = impl Into<clap::Arg>> {
    [
        clap::Arg::new("target")
            .long("target")
            .alias("target")
            .action(clap::ArgAction::Set)
            .num_args(0..)
            // This option carries the crawl targets, so its help text must
            // not repeat the description of `--custom-headers`.
            .help("Target URLs to crawl"),
        clap::Arg::new("custom-headers")
            .long("custom-headers")
            .alias("custom-headers")
            .action(clap::ArgAction::Set)
            .num_args(0..)
            .help("Custom Http Headers"),
    ]
}

pub fn chromium_args() -> impl IntoIterator<Item = impl Into<clap::Arg>> {
[
clap::Arg::new("path").long("chromium-path").alias("path")
clap::Arg::new("path").long("path").alias("path")
.help("Path for Chrome or Chromium."),
clap::Arg::new("headless").long("headless").alias("headless")
.default_value("true")
Expand Down Expand Up @@ -37,6 +57,7 @@ pub fn cli() -> Result<(), Box<dyn std::error::Error>> {
let app = clap::Command::new("demon")
.version(clap::crate_version!())
.about(clap::crate_description!())
.args(demon_args())
.subcommands(&[
clap::Command::new("chromium")
.args(chromium_args())
Expand All @@ -45,23 +66,53 @@ pub fn cli() -> Result<(), Box<dyn std::error::Error>> {
])
.get_matches();

let headers: HashMap<_, _> = app
.get_many::<String>("custom-headers")
.unwrap_or_default()
.map(|pair| {
let mut iter = pair.split(':');
let key = iter.next().expect("No key found");
let value = iter.next().expect("No value found");
(key.to_string(), value.to_string())
})
.collect();

let chromium_path = Some(default_executable().map_err(|e| anyhow!(e))?);

let target = app
.get_many::<String>("target")
.expect("target not allow empty")
.map(|s| s.to_string())
.collect();

let config = model::task::TaskConfig {
target,
headers,
robots: false,
range: 0,
repeat: 0,
};

env_logger::init_from_env(env_logger::Env::new().default_filter_or("INFO"));

let buf = std::env::current_dir()
.unwrap()
.join("files/user_agent.toml");
std::env::set_var("user_agent", buf);

if app.subcommand().is_none() {
return crawler::browse_wikipedia(LaunchOptions::default());
let launch_options = LaunchOptions::default_builder()
.path(chromium_path)
.build()?;
return crawler::browse_wikipedia(config, launch_options);
}

env_logger::init_from_env(env_logger::Env::new().default_filter_or("INFO"));

let (name, command) = app.subcommand().unwrap();
match name {
"chromium" => {
let path = match command.get_one::<String>("path") {
Some(h) => Some(std::path::PathBuf::from(h.parse::<String>().unwrap())),
None => Some(default_executable().map_err(|e| anyhow!(e))?),
None => chromium_path,
};

let proxy = command.get_one::<String>("proxy").map(|h| h.as_str());
Expand All @@ -82,7 +133,7 @@ pub fn cli() -> Result<(), Box<dyn std::error::Error>> {
.user_data_dir(user_data_dir)
.build()?;

crawler::browse_wikipedia(launch_options)
crawler::browse_wikipedia(config, launch_options)
}
_ => {
panic!("The current feature is not implemented or {name} does not exist")
Expand Down
38 changes: 26 additions & 12 deletions src/handler/crawler.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,33 @@
use headless_chrome::{Browser, LaunchOptions};

use crate::common;
use crate::{common, model};

pub fn browse_wikipedia(launch_options: LaunchOptions) -> Result<(), Box<dyn std::error::Error>> {
pub fn browse_wikipedia(
config: model::task::TaskConfig,
launch_options: LaunchOptions,
) -> Result<(), Box<dyn std::error::Error>> {
let browser = Browser::new(launch_options)?;
let tab = browser.new_tab()?;
let random_ug = common::user_agent::random_user_agent();
tab.set_user_agent(random_ug.as_str(), None, None).unwrap();
tab.navigate_to("https://example.com")?;
let h1 = tab.wait_for_xpath("/html/body/div/h1")?;
assert_eq!(h1.get_inner_text().unwrap().as_str(), "Example Domain");
let ug = tab
.evaluate("window.navigator.userAgent", false)?
.value
for item in &config.target {
let tab = browser.new_tab()?;
let random_ug = common::user_agent::random_user_agent();
tab.set_user_agent(random_ug.as_str(), None, None).unwrap();
tab.navigate_to(item)?;
tab.set_extra_http_headers(
config
.headers
.iter()
.map(|(k, v)| (k.as_str(), v.as_str()))
.collect(),
)
.unwrap();
assert_eq!(random_ug, ug);
let h1 = tab.wait_for_xpath("/html/body/div/h1")?;
assert_eq!(h1.get_inner_text().unwrap().as_str(), "Example Domain");
let ug = tab
.evaluate("window.navigator.userAgent", false)?
.value
.unwrap();
assert_eq!(random_ug, ug);
}

Ok(())
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod cli;
pub mod common;
pub mod handler;
pub mod model;
1 change: 1 addition & 0 deletions src/model/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod task;
9 changes: 9 additions & 0 deletions src/model/task.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
use std::collections::HashMap;

/// Settings for a single crawl task.
///
/// Derives `Debug` so the configuration can be logged, `Clone` so it can be
/// handed to more than one consumer, and `Default` for an empty baseline
/// (no targets, no headers, `false`/`0` for the remaining fields).
#[derive(Debug, Clone, Default)]
pub struct TaskConfig {
    /// Targets to visit; the crawler navigates to each entry in order.
    pub target: Vec<String>,
    /// Extra HTTP headers (name -> value) passed to each browser tab.
    pub headers: HashMap<String, String>,
    /// NOTE(review): presumably toggles robots.txt handling — not read by
    /// any code visible here; confirm.
    pub robots: bool,
    /// NOTE(review): meaning not visible here (crawl scope/depth?) — confirm.
    pub range: i8,
    /// NOTE(review): meaning not visible here (repeat/retry count?) — confirm.
    pub repeat: i8,
}

0 comments on commit 266d13a

Please sign in to comment.