Skip to content

Commit

Permalink
Add file checksum fetch caching
Browse files Browse the repository at this point in the history
  • Loading branch information
alexheretic committed Feb 15, 2024
1 parent 3ba8e55 commit db6aede
Show file tree
Hide file tree
Showing 8 changed files with 195 additions and 50 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- Add info logs for release & metadata fetch latency.
- When fetching all releases handle 429 by backing off.
- Improve fetch error logging.
- Add file checksum fetch caching controlled by `cache-releases-older-than` config.

# v0.1.4

Expand Down
27 changes: 27 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
name = "gitlab-cargo-shim"
version = "0.1.4"
edition = "2021"
authors = [
"Jordan Doyle <jordan@doyl.ee>"
]
authors = ["Jordan Doyle <jordan@doyl.ee>"]

[dependencies]
anyhow = "1"
Expand All @@ -17,6 +15,7 @@ cargo-platform = "0.1"
clap = { version = "4", features = ["derive", "cargo", "wrap_help"] }
futures = "0.3"
hex = "0.4"
humantime-serde = "1.1.1"
indexmap = "2"
indoc = "2.0"
itoa = "1.0"
Expand All @@ -26,9 +25,10 @@ parse_link_header = "0.3"
percent-encoding = "2.3"
reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] }
semver = "1.0"
serde = { version = "1.0", features = ["derive"] }
serde = { version = "1.0", features = ["derive", "rc"] }
serde_json = "1"
shlex = "1.1"
smol_str = { version = "0.2.1", features = ["serde"] }
thrussh = "0.34"
thrussh-keys = "0.22"
thrussh-libsodium = "=0.2.1" # 0.2.2 causes dynamic linking by enabling use-pkg-config
Expand Down
9 changes: 9 additions & 0 deletions config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,12 @@ uri = "http://127.0.0.1:3000"
## The correct format must be available in the package registry for all
## packages.
# metadata-format = "json"

## Cache file checksum fetches for all releases older than this value.
##
## If omitted no caching will occur.
##
## Note: Caching shouldn't be used if published releases are expected to be mutated.
## However, a grace period lets the majority of crates benefit from caching
## while still handling mutation of recently published crates.
# cache-releases-older-than = "2 days"
42 changes: 37 additions & 5 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
use crate::providers::gitlab::handle_error;
use clap::Parser;
use serde::{de::DeserializeOwned, Deserialize};
use std::{io, net::SocketAddr, path::PathBuf, str::FromStr};
use time::Duration;
use std::{io, net::SocketAddr, path::PathBuf, str::FromStr, time::Duration};
use url::Url;

#[derive(Parser)]
Expand Down Expand Up @@ -36,19 +35,23 @@ pub struct GitlabConfig {
pub uri: Url,
/// If absent personal access tokens must be provided.
pub admin_token: Option<String>,
// TODO use humantime-serde?
#[serde(default = "GitlabConfig::default_token_expiry")]
pub token_expiry: Duration,
pub token_expiry: time::Duration,
#[serde(default)]
pub ssl_cert: Option<String>,
/// Metadata format for fetching.
#[serde(default)]
pub metadata_format: MetadataFormat,
/// Cache file checksum fetches for all releases older than this value.
#[serde(default, with = "humantime_serde")]
pub cache_releases_older_than: Option<Duration>,
}

impl GitlabConfig {
#[must_use]
const fn default_token_expiry() -> Duration {
Duration::days(30)
const fn default_token_expiry() -> time::Duration {
time::Duration::days(30)
}
}

Expand Down Expand Up @@ -95,3 +98,32 @@ pub fn from_toml_path<T: DeserializeOwned>(path: &str) -> Result<T, std::io::Err
let contents = std::fs::read(path)?;
toml::from_slice(&contents).map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
}

/// Deserializes a complete TOML document into [`Config`] and checks every
/// field, including the humantime-formatted `cache-releases-older-than`
/// value and the defaults applied to the keys the document omits.
#[test]
fn deser_config() {
    // "2 days" expressed in seconds.
    const TWO_DAYS_SECS: u64 = 2 * 24 * 60 * 60;

    let raw = r#"
listen-address = "[::]:2222"
state-directory = "/var/lib/gitlab-cargo-shim"
[gitlab]
uri = "http://127.0.0.1:3000"
metadata-format = "json.zst"
cache-releases-older-than = "2 days""#;

    let parsed: Config = toml::from_str(raw).unwrap();

    // Top-level settings.
    assert_eq!(
        parsed.state_directory.to_string_lossy(),
        "/var/lib/gitlab-cargo-shim"
    );
    assert_eq!(parsed.listen_address.to_string(), "[::]:2222");

    // `[gitlab]` table: explicit values plus defaults for the omitted keys.
    let gitlab = &parsed.gitlab;
    assert_eq!(gitlab.uri.as_str(), "http://127.0.0.1:3000/");
    assert_eq!(gitlab.admin_token, None);
    assert_eq!(gitlab.token_expiry, GitlabConfig::default_token_expiry());
    assert_eq!(gitlab.ssl_cert, None);
    assert_eq!(gitlab.metadata_format, MetadataFormat::JsonZst);
    assert_eq!(
        gitlab.cache_releases_older_than,
        Some(Duration::from_secs(TWO_DAYS_SECS))
    );
}
116 changes: 76 additions & 40 deletions src/providers/gitlab.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// blocks_in_conditions: didn't work with `#[instrument(...)]` usage
#![allow(clippy::module_name_repetitions, clippy::blocks_in_conditions)]
mod checksums;

use crate::{
config::{GitlabConfig, MetadataFormat},
Expand All @@ -8,12 +9,14 @@ use crate::{
use anyhow::Context;
use async_trait::async_trait;
use backoff::backoff::Backoff;
use checksums::ChecksumCache;
use futures::{stream::FuturesUnordered, StreamExt, TryStreamExt};
use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
use reqwest::{header, Certificate};
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use time::{Duration, OffsetDateTime};
use smol_str::{format_smolstr, SmolStr};
use std::{sync::Arc, time::Duration};
use time::OffsetDateTime;
use tokio::sync::Semaphore;
use tracing::{debug, info_span, instrument, Instrument};
use url::Url;
Expand All @@ -24,9 +27,11 @@ const PARALLEL_PACKAGE_FILES_GETS: usize = 32;
pub struct Gitlab {
client: reqwest::Client,
base_url: Url,
token_expiry: Duration,
token_expiry: time::Duration,
metadata_format: MetadataFormat,
admin_token: Option<String>,
checksums: ChecksumCache,
cache_checksums_older_than: Option<Duration>,
}

impl Gitlab {
Expand All @@ -45,8 +50,49 @@ impl Gitlab {
token_expiry: config.token_expiry,
metadata_format: config.metadata_format,
admin_token: config.admin_token.clone(),
checksums: <_>::default(),
cache_checksums_older_than: config.cache_releases_older_than,
})
}

/// Resolves the sha256 checksum of the package file identified by `key`.
///
/// A cached checksum is returned without any request. Otherwise the
/// package's file list is requested from `key.fetch_url()` (authenticated
/// with `do_as` or the admin token, sent via `send_retry_429`) and searched
/// for an entry whose `file_name` equals `key.file_name`.
///
/// Returns `Ok(None)` when the listing has no such file.
///
/// # Errors
///
/// Fails if the request cannot be sent, if `handle_error` rejects the
/// response, or if the body cannot be decoded as JSON.
async fn fetch_checksum(
    &self,
    key: checksums::Key,
    do_as: &User,
) -> anyhow::Result<Option<Arc<str>>> {
    // Cache hit: nothing to fetch.
    if let Some(cached) = self.checksums.get(&key) {
        return Ok(Some(cached));
    }

    let response = self
        .client
        .get(key.fetch_url())
        .user_or_admin_token(do_as, &self.admin_token)
        .send_retry_429()
        .await?;
    let listing: Vec<GitlabPackageFilesResponse> =
        handle_error(response).await?.json().await?;

    let file = match listing
        .into_iter()
        .find(|candidate| candidate.file_name == key.file_name)
    {
        Some(found) => found,
        None => return Ok(None),
    };

    // Only files created strictly before `now - cache_checksums_older_than`
    // are cached, so they need not be fetched again; without that setting
    // nothing is ever cached.
    match self.cache_checksums_older_than {
        Some(min_age) if file.created_at < OffsetDateTime::now_utc() - min_age => {
            self.checksums.set(key, Arc::clone(&file.file_sha256));
        }
        _ => {}
    }

    Ok(Some(file.file_sha256))
}
}

#[async_trait]
Expand Down Expand Up @@ -219,7 +265,7 @@ impl super::PackageProvider for Gitlab {
let mut splitter = release.links.web_path.splitn(2, "/-/packages/");
match (splitter.next(), splitter.next()) {
(Some(project), Some(package)) => (&project[1..], package),
_ => return Ok(None),
_ => return anyhow::Ok(None),
}
};

Expand All @@ -229,40 +275,29 @@ impl super::PackageProvider for Gitlab {
.to_string(),
});

let package_files: Vec<GitlabPackageFilesResponse> = handle_error(
this.client
.get(format!(
"{}/projects/{}/packages/{}/package_files",
this.base_url,
utf8_percent_encode(project, NON_ALPHANUMERIC),
utf8_percent_encode(package, NON_ALPHANUMERIC),
))
.user_or_admin_token(&do_as, &this.admin_token)
.send_retry_429()
.await?,
)
.await?
.json()
.await?;

let expected_file_name =
format!("{}-{}.crate", release.name, release.version);

Ok::<_, anyhow::Error>(
package_files
.into_iter()
.find(|package_file| package_file.file_name == expected_file_name)
.map(move |package_file| {
(
Arc::clone(&package_path),
Release {
name: Arc::from(release.name),
version: release.version,
checksum: package_file.file_sha256,
},
)
}),
)
let key = checksums::Key {
base_url: this.base_url.as_str().into(),
project: project.into(),
package: package.into(),
file_name: format_smolstr!(
"{}-{}.crate",
release.name,
release.version
),
};

let checksum = this.fetch_checksum(key, &do_as).await?;

Ok(checksum.map(|checksum| {
(
Arc::clone(&package_path),
Release {
name: Arc::from(release.name),
version: release.version,
checksum,
},
)
}))
}
.instrument(info_span!("fetch_package_files")),
);
Expand Down Expand Up @@ -357,8 +392,9 @@ pub struct GitlabImpersonationTokenResponse {

#[derive(Deserialize)]
pub struct GitlabPackageFilesResponse {
pub file_name: String,
pub file_sha256: String,
pub file_name: SmolStr,
pub created_at: time::OffsetDateTime,
pub file_sha256: Arc<str>,
}

#[derive(Deserialize)]
Expand Down
40 changes: 40 additions & 0 deletions src/providers/gitlab/checksums.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
use parking_lot::RwLock;
use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
use smol_str::SmolStr;
use std::{collections::HashMap, sync::Arc};

/// In-memory cache of `/package_files` checksums fetched from
/// <https://docs.gitlab.com/ee/api/packages.html#list-package-files>
///
/// Entries are keyed by [`Key`] and hold the checksum as a shared `Arc<str>`.
/// The map sits behind an `RwLock`, so it is read and updated through `&self`.
#[derive(Debug, Default)]
pub struct ChecksumCache {
// Checksum per package file key; starts empty via `Default`.
checksums: RwLock<HashMap<Key, Arc<str>>>,
}

impl ChecksumCache {
    /// Returns a shared handle to the checksum stored under `key`, or `None`
    /// when nothing is stored for it.
    pub fn get(&self, key: &Key) -> Option<Arc<str>> {
        let cache = self.checksums.read();
        cache.get(key).map(Arc::clone)
    }

    /// Stores `checksum` under `key`, replacing any entry already present.
    pub fn set(&self, key: Key, checksum: Arc<str>) {
        let mut cache = self.checksums.write();
        cache.insert(key, checksum);
    }
}

/// Identifies one package file whose checksum may be cached.
///
/// `base_url`, `project` and `package` are what [`Key::fetch_url`] combines
/// into the `/package_files` URL; `file_name` is not part of that URL.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Key {
/// Prefix placed before `/projects/...` in the fetch URL.
pub base_url: SmolStr,
/// Project segment of the fetch URL (percent-encoded when the URL is built).
pub project: SmolStr,
/// Package segment of the fetch URL (percent-encoded when the URL is built).
pub package: SmolStr,
/// Name of the file within the package; presumably matched by the caller
/// against the `file_name` entries of the fetched listing.
pub file_name: SmolStr,
}

impl Key {
    /// Builds the URL of the `package_files` listing for this key:
    /// `{base_url}/projects/{project}/packages/{package}/package_files`.
    ///
    /// `project` and `package` are percent-encoded with `NON_ALPHANUMERIC`;
    /// `base_url` is inserted as-is and `file_name` is not used.
    pub fn fetch_url(&self) -> String {
        let base_url = &self.base_url;
        let project = utf8_percent_encode(self.project.as_str(), NON_ALPHANUMERIC);
        let package = utf8_percent_encode(self.package.as_str(), NON_ALPHANUMERIC);

        format!("{base_url}/projects/{project}/packages/{package}/package_files")
    }
}
2 changes: 1 addition & 1 deletion src/providers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,5 @@ pub type ReleaseName = Arc<str>;
pub struct Release {
pub name: ReleaseName,
pub version: String,
pub checksum: String,
pub checksum: Arc<str>,
}

0 comments on commit db6aede

Please sign in to comment.