Skip to content

Commit

Permalink
feat: add global state and parse url
Browse files Browse the repository at this point in the history
  • Loading branch information
baerwang committed Jan 13, 2024
1 parent 8fff388 commit a6fddd8
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 15 deletions.
31 changes: 31 additions & 0 deletions src/channel/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
use headless_chrome::Browser;
use tokio::sync::mpsc::Sender;

use crate::model::task::TaskConfig;

/// State shared by the crawl tasks, bundled so it can be handed around as a
/// single `&GlobalState` instead of several separate arguments.
pub struct GlobalState {
/// Root that collected links are resolved against (passed as `root` to
/// `parse_url` in the link collector).
pub domain: String,
/// Headless Chrome browser handle; `crawler::tasks` opens its tabs on it.
pub browser: Browser,
/// Task configuration; `crawler::tasks` reads credentials and extra HTTP
/// headers from it.
pub config: TaskConfig,

/// Sending half of the URL channel used by `send_message`.
/// Always `Some` when the state is built through `GlobalState::new`.
pub sender: Option<Sender<String>>,
}

impl GlobalState {
pub fn new(tx: Sender<String>, domain: String, browser: Browser, config: TaskConfig) -> Self {
GlobalState {
domain,
browser,
config,
sender: Some(tx),
}
}

pub fn send_message(&self, message: &str) {
if let Some(ref sender) = self.sender {
if sender.blocking_send(message.to_owned()).is_err() {
log::error!("Failed to send URL through channel");
}
}
}
}
10 changes: 8 additions & 2 deletions src/cli/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use crate::cli::args;
use crate::handler::crawler;
use crate::handler::robots::robots;
use crate::handler::sitemap::sitemap;
use crate::{common, handler, model};
use crate::{channel, common, handler, model};

pub async fn cli() -> Result<(), Box<dyn std::error::Error>> {
let app = args::CLi::parse();
Expand Down Expand Up @@ -115,9 +115,15 @@ pub async fn cli() -> Result<(), Box<dyn std::error::Error>> {

let set: DashSet<String> = DashSet::new();
let browser = Browser::new(launch_options)?;
let state = channel::GlobalState::new(
tx.clone(),
Arc::new(app.target[0].clone()).clone().to_string(),
browser,
config,
);
while let Some(url) = rx.recv().await {
if set.insert(url.clone()) {
_ = crawler::tasks(url.clone().as_str(), tx.clone(), browser.clone(), &config);
_ = crawler::tasks(url.clone().as_str(), tx.clone(), &state);
} else {
println!("Value {} already exists", url.clone());
}
Expand Down
32 changes: 27 additions & 5 deletions src/handler/collect.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ use std::collections::HashSet;
use std::error::Error;
use std::sync::Arc;

use crate::common::filter::matching_filter;
use headless_chrome::Tab;

use crate::channel;
use crate::common::filter::matching_filter;
use crate::common::util;

const JS_HREF: &str = r#"
Expand Down Expand Up @@ -40,20 +41,41 @@ const JS_OBJECT: &str = r#"
list
"#;

pub fn collect(tab: &Arc<Tab>) {
_ = query_selector_all(tab, JS_HREF);
_ = query_selector_all(tab, JS_OBJECT);
/// Runs each link-collecting script (`JS_HREF`, then `JS_OBJECT`) against
/// `tab`. The outcome of every query, success or error, is discarded.
pub fn collect(state: &channel::GlobalState, tab: &Arc<Tab>) {
    for script in [JS_HREF, JS_OBJECT] {
        _ = query_selector_all(state, tab, script);
    }
}

fn query_selector_all(tab: &Arc<Tab>, v: &str) -> Result<HashSet<String>, Box<dyn Error>> {
/// Evaluates the script `v` in `tab` and returns the set of links it yields.
///
/// The evaluation result is serialised to text and parsed back as a set of
/// strings; entries rejected by `matching_filter` are dropped and the rest
/// are passed through `parse_url` together with `state.domain`.
///
/// Returns an empty set when the evaluation produced no value.
///
/// # Errors
///
/// Propagates the error from the tab method call and from JSON parsing.
fn query_selector_all(
    state: &channel::GlobalState,
    tab: &Arc<Tab>,
    v: &str,
) -> Result<HashSet<String>, Box<dyn Error>> {
    let evaluated = tab.call_method(util::evaluate(v))?;

    let raw = match evaluated.result.value {
        Some(value) => value.to_string(),
        None => return Ok(HashSet::new()),
    };

    let parsed = serde_json::from_str::<HashSet<String>>(&raw)?;
    let links = parsed
        .into_iter()
        .filter(|link| matching_filter(link))
        .map(|link| parse_url(&state.domain, link))
        .collect();

    Ok(links)
}

/// Turns a link found on a page into an absolute URL rooted at `root`.
///
/// * Links that already start with `http://` or `https://` are returned
///   unchanged.
/// * Every other link is treated as a path below `root`: all `../` segments
///   are stripped and the remainder is joined to `root` with exactly one `/`
///   between them, regardless of whether `root` ends with a slash or `child`
///   starts with one.
///
/// Stripping `../` is an approximation that resolves parent references
/// against the root only; it is not full RFC 3986 relative-reference
/// resolution, which would need the URL of the page the link was found on.
fn parse_url(root: &str, child: String) -> String {
    if child.starts_with("http://") || child.starts_with("https://") {
        return child;
    }

    // `replace` is a no-op when the link has no parent-directory segments.
    let path = child.replace("../", "");

    // Trim the slashes on both sides of the seam so the join never yields
    // `root//path`.
    format!(
        "{}/{}",
        root.trim_end_matches('/'),
        path.trim_start_matches('/')
    )
}
16 changes: 8 additions & 8 deletions src/handler/crawler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,26 +7,25 @@ use headless_chrome::protocol::cdp::Page::{
SetDownloadBehaviorBehaviorOption,
};
use headless_chrome::protocol::cdp::Runtime::AddBinding;
use headless_chrome::{Browser, Tab};
use headless_chrome::Tab;
use tokio::sync::mpsc;

use crate::common::util;
use crate::handler::collect::collect;
use crate::handler::form::{Html, FORM};
use crate::handler::form_js::{JS_CODE, TAB_INIT};
use crate::{common, model};
use crate::{channel, common};

pub fn tasks(
url: &str,
tx: mpsc::Sender<String>,
browser: Browser,
config: &model::task::TaskConfig,
state: &channel::GlobalState,
) -> Result<(), Box<dyn std::error::Error>> {
let random_ug = common::user_agent::random_user_agent();
let tab = browser.new_tab()?;
let tab = state.browser.new_tab()?;
tab.enable_runtime()?;
tab.enable_fetch(None, Some(true))?;
tab.authenticate(config.username.clone(), config.password.clone())?;
tab.authenticate(state.config.username.clone(), state.config.password.clone())?;
tab.set_user_agent(random_ug.as_str(), None, None).unwrap();
tab.call_method(add_binding("addLink"))?;
tab.call_method(add_binding("Test"))?;
Expand All @@ -36,7 +35,8 @@ pub fn tasks(
include_command_line_api: None,
})?;
tab.set_extra_http_headers(
config
state
.config
.headers
.iter()
.map(|(k, v)| (k.as_str(), v.as_str()))
Expand All @@ -63,7 +63,7 @@ pub fn tasks(
}
}
}
collect(&tab);
collect(state, &tab);
_ = tab.close(true);

Ok(())
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod channel;
pub mod cli;
pub mod common;
pub mod handler;
Expand Down

0 comments on commit a6fddd8

Please sign in to comment.