Skip to content

Commit

Permalink
Add task to verify indexing against Top 100 RubyGems (#2330)
Browse files Browse the repository at this point in the history
Add task to verify indexing against Top 100 RubyGems
  • Loading branch information
andyw8 authored Jul 24, 2024
1 parent 7885527 commit 77de05f
Show file tree
Hide file tree
Showing 4 changed files with 222 additions and 0 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/indexing.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Workflow that runs the `index:topgems` rake task as a sanity check of the indexer.
name: CI (indexing)

# Only trigger on pushes and pull requests that touch the lockfile or the indexer sources.
on:
push:
paths:
- 'Gemfile.lock'
- 'lib/ruby_indexer/**'
pull_request:
paths:
- 'Gemfile.lock'
- 'lib/ruby_indexer/**'

jobs:
# Single job: check out the repository, set up Ruby, then run the rake task.
indexing_sanity_check:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Ruby
uses: ruby/setup-ruby@v1
with:
bundler-cache: true

# The rake task depends on `download:topgems`, so the gems are fetched as part of this step.
- name: Index Top 100 Ruby gems
run: bundle exec rake index:topgems
95 changes: 95 additions & 0 deletions rakelib/index.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# frozen_string_literal: true

# Based on https://github.com/ruby/prism/blob/main/rakelib/lex.rake

module GemIndexing
  class << self
    # Calls the given block once for every element of +items+, spreading the
    # calls across a pool of worker threads, and blocks until every item has
    # been processed. This is particularly useful for tasks that are IO-bound
    # like downloading files or reading files from disk.
    #
    # The pool size is read from the WORKERS environment variable (default 16).
    # A value that parses to less than one is treated as one so that the items
    # are never silently skipped.
    #
    # Note: this sets Thread.abort_on_exception globally, so an exception
    # raised by the block in any worker aborts the whole process.
    #
    # @param items [Enumerable] the items to hand to the block
    # @yieldparam item [Object] one element of +items+
    # @return [Array<Thread>] the (finished) worker threads
    def parallelize(items, &block)
      Thread.abort_on_exception = true

      queue = Queue.new
      items.each { |item| queue << item }

      worker_count = [ENV.fetch("WORKERS") { 16 }.to_i, 1].max
      workers = Array.new(worker_count) { parallelize_thread(queue, &block) }

      workers.each(&:join)
    end

    private

    # Create a new thread with a minimal number of locals that it can access.
    #
    # The worker drains the queue with a non-blocking pop. Checking `empty?`
    # and then calling a blocking `shift` is racy: another worker can take the
    # last item between the two calls, leaving this thread waiting forever.
    def parallelize_thread(queue, &block)
      Thread.new do
        loop do
          item =
            begin
              queue.pop(true)
            rescue ThreadError
              # Raised by a non-blocking pop on an empty queue: nothing left to do.
              break
            end

          block.call(item)
        end
      end
    end
  end
end

# Relative path of the YAML file listing the gems to download and index.
TOP_100_GEM_FILENAME = "rakelib/top_100_gems.yml"
# Relative path of the directory the gems' Ruby files are extracted into.
# The index:topgems task removes this directory when it finishes.
TOP_100_GEMS_DIR = "tmp/top_100_gems"

namespace :download do
  directory TOP_100_GEMS_DIR

  # Fetches every gem listed in TOP_100_GEM_FILENAME from rubygems.org and
  # extracts its Ruby files into TOP_100_GEMS_DIR/<gem_name>. Gems whose
  # directory already exists are skipped. Raises if a download does not
  # return a successful HTTP response.
  desc "Download the top 100 rubygems under #{TOP_100_GEMS_DIR}/"
  task topgems: TOP_100_GEMS_DIR do
    $LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
    require "net/http"
    require "rubygems/package"
    require "tmpdir"
    require "yaml"

    GemIndexing.parallelize(YAML.safe_load_file(TOP_100_GEM_FILENAME)) do |gem_name|
      directory = File.expand_path("#{TOP_100_GEMS_DIR}/#{gem_name}")
      next if File.directory?(directory)

      puts "Downloading #{gem_name}"

      uri = URI.parse("https://rubygems.org/gems/#{gem_name}.gem")
      response = Net::HTTP.get_response(uri)
      unless response.is_a?(Net::HTTPSuccess)
        raise "Failed to download #{gem_name}: #{response.code} #{response.message}"
      end

      Dir.mktmpdir do |tmpdir|
        filepath = File.join(tmpdir, "#{gem_name}.gem")
        # A .gem file is a binary archive, so write it in binary mode to avoid
        # newline/encoding translation corrupting it.
        File.binwrite(filepath, response.body)
        # Only the Ruby sources are needed for indexing.
        Gem::Package.new(filepath).extract_files(directory, "**/*.rb")
      end
    end
  end
end

# This task indexes against the top 100 gems, and will exit(1) if any fail.
#
# Every gem gets its own RubyIndexer::Index. Each Ruby file extracted by
# download:topgems is indexed on its own; a failure is recorded with its
# message and file path rather than stopping the run. The download directory
# is always removed afterwards, whether or not indexing succeeded.
desc "Index against the top 100 rubygems"
task "index:topgems": ["download:topgems"] do
  $LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
  require "net/http"
  require "rubygems/package"
  require "tmpdir"
  require "yaml"

  gem_names = YAML.safe_load_file(TOP_100_GEM_FILENAME)

  # Failures are appended from several worker threads, so guard the shared
  # array with a mutex instead of reassigning it per gem (which would drop
  # the errors collected for every other gem).
  errors = []
  errors_lock = Mutex.new

  GemIndexing.parallelize(gem_names) do |gem_name|
    directory = File.expand_path("#{TOP_100_GEMS_DIR}/#{gem_name}")

    index = RubyIndexer::Index.new

    Dir[File.join(directory, "**", "*.rb")].each do |filepath|
      print(".")
      code = File.read(filepath)
      index.index_single(RubyIndexer::IndexablePath.new(nil, filepath), code)
    rescue => e
      errors_lock.synchronize { errors << { message: e.message, file: filepath } }
    end
  end

  if errors.any?
    puts "errors: #{errors}"
    # Fail the task (and therefore CI); the ensure clause below still runs.
    exit(1)
  end
ensure
  FileUtils.rm_rf(TOP_100_GEMS_DIR)
end
101 changes: 101 additions & 0 deletions rakelib/top_100_gems.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
---
# Each entry is "<gem name>-<version>". The download:topgems rake task fetches
# https://rubygems.org/gems/<entry>.gem and extracts it under tmp/top_100_gems/<entry>.
- actioncable-7.0.4.3
- actionmailbox-7.0.4.3
- actionmailer-7.0.4.3
- actionpack-7.0.4.3
- actiontext-7.0.4.3
- actionview-7.0.4.3
- activejob-7.0.4.3
- activemodel-7.0.4.3
- activerecord-7.0.4.3
- activestorage-7.0.4.3
- activesupport-7.0.4.3
- addressable-2.8.4
- autoprefixer-rails-10.4.13.0
- aws-partitions-1.744.0
- aws-sdk-cloudformation-1.77.0
- aws-sdk-cloudfront-1.76.0
- aws-sdk-cloudwatch-1.72.0
- aws-sdk-core-3.171.0
- aws-sdk-dynamodb-1.83.0
- aws-sdk-ec2-1.375.0
- aws-sdk-iam-1.77.0
- aws-sdk-kinesis-1.45.0
- aws-sdk-kms-1.63.0
- aws-sdk-lambda-1.93.0
- aws-sdk-rds-1.175.0
- aws-sdk-resources-3.162.0
- aws-sdk-s3-1.120.1
- aws-sdk-secretsmanager-1.73.0
- aws-sdk-sns-1.60.0
- aws-sdk-ssm-1.150.0
- backports-3.24.1
- brakeman-5.4.1
- bundler-2.4.11
- capybara-3.39.0
- concurrent-ruby-1.2.2
- connection_pool-2.4.0
- dalli-3.2.4
- database_cleaner-2.0.2
- devise-4.9.2
- dry-types-1.7.1
- elasticsearch-8.7.0
- elasticsearch-api-8.7.0
- excon-0.99.0
- faker-3.1.1
- faraday-retry-2.1.0
- fastlane-2.212.1
- fog-aws-3.18.0
- git-1.18.0
- google-cloud-errors-1.3.1
- google-protobuf-3.22.2
- googleauth-1.5.1
- graphql-2.0.21
- grpc-1.53.0
- jwt-2.7.0
- loofah-2.20.0
- mail-2.8.1
- mime-types-data-3.2023.0218.1
- minitest-5.18.0
- msgpack-1.7.0
- net-http-persistent-4.0.2
- net-ssh-7.1.0
- newrelic_rpm-9.1.0
- nio4r-2.5.9
- nokogiri-1.14.3
- octokit-6.1.1
- oj-3.14.3
- parser-3.2.2.0
- pg-1.4.6
- plist-3.7.0
- puma-6.2.1
- rack-3.0.7
- rack-cors-2.0.1
- rack-protection-3.0.6
- rack-test-2.1.0
- rails-7.0.4.3
- railties-7.0.4.3
- raindrops-0.20.1
- redis-store-1.9.2
- regexp_parser-2.7.0
- responders-3.1.0
- rouge-4.1.0
- rspec-core-3.12.1
- rspec-mocks-3.12.5
- rubocop-1.50.0
- rubocop-ast-1.28.0
- rubocop-performance-1.17.1
- rubocop-rails-2.19.0
- rubocop-rspec-2.19.0
- ruby-progressbar-1.13.0
- ruby_parser-3.20.0
- rubygems-update-3.4.11
- selenium-webdriver-4.8.6
- sidekiq-7.0.8
- sinatra-3.0.6
- slop-4.10.1
- sqlite3-1.6.2
- thin-1.8.2
- tilt-2.1.0
- yard-0.9.32
- zeitwerk-2.6.7
1 change: 1 addition & 0 deletions sorbet/config
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
--dir
.
--ignore=vendor/
--ignore=tmp/
--ignore=test/fixtures/
--ignore=test/expectations/
--enable-experimental-requires-ancestor

0 comments on commit 77de05f

Please sign in to comment.