require 'csv'
require 'etc'
require 'open3'
require 'rubygems'
require 'rubygems/package'
require 'tmpdir'
# require 'licensee'

# Constants

# Header row of the generated reports; fields are separated by ';'.
CSV_HEADER = "gem2rpm_and_fedora_matches?;license_validate_exit_code;gem_name;fedora_license;gem2rpm_license;dead_package".freeze

# Sigh... they have tar as rubygem source...
EXCLUDED_SOURCES = %w[rubygem-morph-cli rubygem-krb5-auth rubygem-asciidoctor rubygem-rgen rubygem-net-irc].freeze

# Captures the gem name from a `%global gem_name foo` / `%define gem_name foo` line.
GEM_NAME_PATTERN = /%(?:global|define)[[:space:]]+gem_name[[:space:]]+([A-Za-z0-9\-_.]+)/.freeze

# Captures the remainder of the first `License:` line.
LICENSE_PATTERN = /License:[[:space:]]*(.*)/.freeze

# How many times a failed fedpkg call is retried before the error is re-raised.
FETCH_RETRIES = 3

# File in which the gem2rpm output is cached between runs.
GEM2RPM_CACHE = 'gem2rpm.cache'.freeze

# Splits a list of items into slices and processes every slice in its own
# thread. The block receives one slice and should return an array of results.
class ThreadWorker
  MAX_THREADS = Etc.nprocessors

  # Default pause (seconds) before every external command.
  COMMAND_DELAY = 0.5

  # @param items [#to_a] work items
  # @yieldparam slice [Array] the part of +items+ handled by one thread
  # @raise [ArgumentError] when +items+ is nil or no block is given
  def initialize(items, &block)
    raise ArgumentError, 'Worker queue received no work...' unless items
    raise ArgumentError, 'Provide block to execute in threads with items.' unless block

    @items = items.to_a
    # Ceiling division keeps the number of slices (and therefore threads) at
    # or below MAX_THREADS, while still using one thread per item when there
    # are fewer items than threads.
    slice_size = [(@items.count.to_f / MAX_THREADS).ceil, 1].max
    @worker_pool = @items.each_slice(slice_size).map do |slice|
      Thread.new(slice) { |work| block.call(work) }
    end
  end

  # Waits for all threads and returns their results, in input order.
  #
  # @param flatten_level [Integer] how many nesting levels to flatten
  # @return [Array]
  def gather_pool(flatten_level = 1)
    @worker_pool.map(&:value).flatten(flatten_level)
  end

  # Raised by ThreadWorker.execute when a command exits unsuccessfully.
  class CommandError < StandardError
    attr_reader :status, :stdout, :stderr

    def initialize(msg, stdout, stderr, status)
      super msg
      @stdout = stdout
      @stderr = stderr
      @status = status
    end
  end

  class << self
    # Runs an external command and captures its output.
    #
    # @param command [String, Array<String>] a String is passed to
    #   Open3.capture3 as-is (shell command line); an Array is passed as an
    #   argv list, so its elements need no shell quoting
    # @param pwd [String, nil] directory to run the command in
    # @param ret_exit [Boolean] return the exit status instead of stdout
    # @param delay [Numeric] seconds to sleep before running the command
    # @return [String, Integer] stdout, or the exit status when +ret_exit+
    # @raise [CommandError] when the command does not succeed
    def execute(command, pwd: nil, ret_exit: false, delay: COMMAND_DELAY)
      argv = Array(command)
      options = {}
      options[:chdir] = pwd if pwd

      $stderr.puts "Executing: #{argv.join(' ')}"
      sleep delay if delay.positive?
      stdout, stderr, status = Open3.capture3(*argv, options)

      unless status.success?
        raise CommandError.new("Command failed: #{argv.join(' ')}; #{stderr}; #{stdout}",
                               stdout, stderr, status.exitstatus)
      end

      ret_exit ? status.exitstatus : stdout
    end
  end
end

# Runs the block and re-runs it when it fails with a CommandError.
#
# @param retries [Integer] number of additional attempts after the first one
# @raise [ThreadWorker::CommandError] when every attempt failed
def with_retries(retries = FETCH_RETRIES)
  failures = 0
  begin
    yield
  rescue ThreadWorker::CommandError
    failures += 1
    retry if failures <= retries
    raise
  end
end

# Extracts the gem name and the License field from the text of a spec file.
#
# @param text [String] spec file content
# @param source [String] name used in error messages
# @return [Array(String, String)] gem name and license
# @raise [RuntimeError] when either field cannot be found
def parse_spec(text, source = 'spec')
  name = text[GEM_NAME_PATTERN, 1]
  raise "#{source}: no gem_name macro found" if name.nil? || name.strip.empty?

  license = text[LICENSE_PATTERN, 1]
  raise "#{source}: no License field found" if license.nil?

  [name.strip, license.strip]
end

# Clones the dist-git repository of +package+ unless its directory exists.
def clone_package(package)
  return if Dir.exist?(package)

  with_retries { ThreadWorker.execute(['fedpkg', 'clone', '-a', package]) }
end

# Downloads the sources of +package+ unless a gem is already present or the
# package is listed in EXCLUDED_SOURCES.
def fetch_gem_sources(package)
  gems = Dir["#{package}/*.gem"]
  # Every gem found under */*.gem later produces its own report row, so make
  # an unexpected second gem visible.
  warn "#{package}: more than one gem present: #{gems.join(', ')}" if gems.size > 1
  return unless gems.empty?
  return if EXCLUDED_SOURCES.include?(package)

  puts "sources for #{package}"
  with_retries { ThreadWorker.execute(%w[fedpkg sources], pwd: package) }
end

# Returns the gem2rpm output rows (already split on ';'), from the cache file
# when it exists, otherwise by running gem2rpm on every */*.gem and writing
# the cache.
#
# template:
# <%# gem_name;gem_version;fedora_license;license_file %>
# <%= spec.name %>;<%= spec.version %>;<%= spec.licenses.join(" and ") %>;<%= main_files.filter do |item| item.license? end.join(" ")%>
#
# @param cache_path [String]
# @return [Array<Array<String>>]
def load_gem2rpm_rows(cache_path = GEM2RPM_CACHE)
  lines =
    if File.exist?(cache_path)
      # Blank lines carry no gem; skip them so cached and fresh runs agree.
      File.read(cache_path).split("\n").map(&:strip).reject(&:empty?)
    else
      # Create CSV from the gems, so that we have smth to compare.
      output = ThreadWorker.new(Dir['*/*.gem']) do |slice|
        slice.map do |gem_file|
          ThreadWorker.execute(['gem2rpm', '--template', './template.erb', gem_file, '--local'])
        end
      end.gather_pool.map(&:lstrip)
      File.write(cache_path, output.join(''))
      output.map(&:rstrip)
    end

  lines.map { |line| line.split(';') }
end

# Runs license-validate on +fedora_license+.
#
# @return [String] "0" on success, "1" when the command exits with 1,
#   otherwise its stderr and stdout joined by "#####"
def license_check(fedora_license)
  ThreadWorker.execute(['license-validate', fedora_license], ret_exit: true).to_s
rescue ThreadWorker::CommandError => e
  e.status == 1 ? e.status.to_s : [e.stderr, e.stdout].join('#####')
end

# Upper-cases the " and " / " or " conjunctions of a license expression.
def normalize_conjunctions(license)
  license&.strip&.gsub(' and ', ' AND ')&.gsub(' or ', ' OR ')
end

# Builds one report row for a gem2rpm row.
#
# NOTE(review): the row has five fields while CSV_HEADER names six; only the
# rows of dead packages carry the dead_package column.
#
# @param row [Array<String>] name, version, license, license file
# @param fedora_licenses [Hash{String=>String}] gem name => spec License
# @param normalize [Boolean] upper-case the conjunctions before comparing
# @param validator [#call] maps a license string to the validation result
# @return [String] "match;validation;gem_name;fedora_license;gem2rpm_license"
# @raise [KeyError] when no spec was parsed for the gem
def comparison_row(row, fedora_licenses, normalize: false, validator: method(:license_check))
  gem_name = row[0]
  gem2rpm_license = row[2]&.strip
  fedora_license = fedora_licenses.fetch(gem_name) do
    raise KeyError, "No Fedora spec found for gem #{gem_name.inspect}"
  end.strip

  if normalize
    gem2rpm_license = normalize_conjunctions(gem2rpm_license)
    fedora_license = normalize_conjunctions(fedora_license)
  end

  matches = fedora_license == gem2rpm_license
  [matches, validator.call(fedora_license), gem_name, fedora_license, gem2rpm_license.to_s].join(';')
end

# Builds the report rows for all gem2rpm rows in worker threads. Every value
# used here is a method-local, so the caller's data is left untouched and the
# same rows can be compared repeatedly.
#
# @return [Array<String>]
def comparison_rows(gem2rpm_rows, fedora_licenses, normalize: false, validator: method(:license_check))
  ThreadWorker.new(gem2rpm_rows) do |slice|
    slice.map { |row| comparison_row(row, fedora_licenses, normalize: normalize, validator: validator) }
  end.gather_pool
end

# Formats "rubygem-<name>, <license>" lines with the licenses aligned in one
# column, one space after the longest package name.
#
# @param rows [Array<String>] report rows
# @return [Array<String>]
def format_package_list(rows)
  entries = rows.map do |row|
    fields = row.split(';')
    ["rubygem-#{fields[2]},", fields[3].to_s]
  end
  width = entries.map { |label, _license| label.length }.max.to_i + 1
  entries.map { |label, license| label.ljust(width) + license }
end

# Builds the text printed for --email.
#
# @param total [Integer] number of spec files
# @param data_rows [Array<String>] report rows without the header
# @param converted [Array<String>] rows that validate after the conjunctions
#   were converted
# @return [String]
def email_summary(total:, data_rows:, converted:)
  valid, to_fix = data_rows.partition { |row| row.split(';')[1] == '0' }
  no_act = valid.count { |row| row.split(';')[0] == 'true' }
  total_checked = data_rows.size

  <<~EMAIL

    * Total rubygems in Fedora: #{total}
    * Total rubygems checked: #{total_checked}
    * Fedora License field and gem2rpm license match and license-validate succeeds: #{no_act}/#{total_checked}
    * license-validate says they are OK SPDX, but licenses may or may not match between Fedora and upstream: #{valid.size}/#{total_checked}
    * license-validate with converted conjunctions: #{converted.size}/#{total_checked}
    * Action required: #{to_fix.size}/#{total_checked}

    Packages that need to convert to valid SPDX:
    #{format_package_list(to_fix).join("\n")}
  EMAIL
end

# Extracts the gem into a temporary directory and asks Licensee for the
# license of +license_file+, or of the whole extracted tree when
# +license_file+ is nil. Needs the Licensee constant to be loaded (the
# require at the top of the file is commented out).
#
# @return [Object, nil] whatever Licensee.license returns; nil on failure
def licensee_detect(gem_path, license_file)
  gem_file = File.expand_path(gem_path)
  Dir.mktmpdir do |destination|
    Gem::Package.new(gem_file).extract_files(destination) # extract the gem into a directory
    if license_file
      target = File.join(destination, license_file)
      File.read(target) # fail early when the license file cannot be read
      Licensee.license(target)
    else
      Licensee.license(destination)
    end
  end
rescue StandardError => e
  warn "License detection failed for #{gem_path} (#{license_file.inspect}): #{e.class}: #{e.message}"
  # In case of an exception, it is needed to debug what went wrong
  # (nonexistent dir, nonexistent file even despite guards...). Only open the
  # debugger when somebody is there to use it.
  if $stdin.tty?
    require 'irb'
    binding.irb
  end
  nil
end

# Match the MIT license against this text... let's see if it even helps
#
# @return [String] "TRUE valid MIT" or "FALSE, inspection required"
def licensee_mit(gem_path, license_file)
  license = licensee_detect(gem_path, license_file)
  return "TRUE valid MIT" if license && license.spdx_id == "MIT"

  "FALSE, inspection required"
end

# Reports which license Licensee detects for the gem.
#
# @return [String] "The license might be <SPDX id>" or "Inspection required"
def licensee_general(gem_path, license_file)
  license = licensee_detect(gem_path, license_file)
  if license && license.spdx_id != "other"
    "The license might be #{license.spdx_id}"
  else
    "Inspection required"
  end
end

# Let's check the correct licensing sometime later
# ret = res.map do |arr|
#   gem2rpm_name = arr[0]
#   gem2rpm_ver = arr[1]
#   gem2rpm_license = arr[2]
#   gem2rpm_license_file = arr[3]
#
#   fedora_gem = csv.find { |row| row["gem_name"] == gem2rpm_name }
#   fedora_name = fedora_gem["gem_name"]
#   fedora_license = fedora_gem["fedora_license"]
#   fedora_gem_license = fedora_gem["gem_license"]
#
#   raise "The names of gems differ. Leading me to this is gem2rpm: #{gem2rpm_name} fedora: #{fedora_name}" if gem2rpm_name != fedora_name
#
#   str = ''
#   if fedora_license == fedora_gem_license && fedora_license == gem2rpm_license
#     str = "fedora matches gem2rpm"
#     # validate_mit_text(gem_path, license_file_name)
#     if fedora_license =~ /MIT/ && gem2rpm_license_file && !gem2rpm_license_file.empty?
#       str += ";" + licensee_mit(Dir["rubygem-#{fedora_name}/#{fedora_name}*.gem"].sort.first, gem2rpm_license_file)
#     elsif fedora_license =~ /MIT/
#       str += ";" + "inspection required, MIT without licensefile detected"
#     elsif fedora_license == "BSD-2-Clause" || fedora_license == "BSD-3-Clause" || fedora_license == "Apache-2.0"
#       str += ";" + "Valid SPDX ID, no intervention required."
#     else
#       begin
#         # ThreadWorker.execute("license-validate #{fedora_license}")
#         str += ';' + licensee_general(Dir["rubygem-#{fedora_name}/#{fedora_name}*.gem"].sort.first, gem2rpm_license_file)
#       rescue RuntimeError => e
#         puts e.message
#       end
#     end
#     str
#   else
#     str = "Fedora does not match gem2rpm" + ";" + "Inspection needed"
#   end
#   str + ";" + fedora_name + ";" + fedora_license.to_s + ";" + gem2rpm_license.to_s
# end

# Fetches the Fedora rubygem packages described by the rubygem-*.spec files in
# the current directory, compares their License field with what gem2rpm
# reports, validates it with license-validate and writes the CSV reports.
#
# @param argv [Array<String>] pass "--email" to also print a summary
def run(argv = ARGV)
  spec_files = Dir['rubygem-*.spec']
  if spec_files.empty?
    # Bail out before an empty gem2rpm cache gets written; an existing cache
    # is never regenerated.
    warn 'No rubygem-*.spec files found in the current directory, nothing to check.'
    return
  end

  csv = ThreadWorker.new(spec_files) do |slice|
    slice.map { |path| parse_spec(File.read(path), path) }
  end.gather_pool
  # Gem name => License; the first spec wins when a name occurs twice.
  fedora_licenses = csv.each_with_object({}) { |(name, license), index| index[name] ||= license }

  # Fetch the Fedora sources
  ThreadWorker.new(csv) do |slice|
    slice.each { |(gem_name, _license)| clone_package("rubygem-#{gem_name}") }
  end.gather_pool

  # Fetch the gem from lookaside cache; the workers return the names of dead
  # packages instead of appending to a shared array.
  orphaned = ThreadWorker.new(csv) do |slice|
    slice.map do |(gem_name, _license)|
      package = "rubygem-#{gem_name}"
      if File.exist?(File.join(package, 'dead.package'))
        gem_name
      else
        fetch_gem_sources(package)
        nil
      end
    end
  end.gather_pool.compact

  res = load_gem2rpm_rows

  sort_by_license_check = ->(a, b) { a.split(';')[1] <=> b.split(';')[1] }
  orphaned_rows = orphaned.map { |gem_name| "#{false};255;#{gem_name};N/A;N/A;#{true}" }
  data_rows = comparison_rows(res, fedora_licenses).sort(&sort_by_license_check).union(orphaned_rows)
  final = [CSV_HEADER] + data_rows
  puts final

  matching_only = data_rows.select { |row| row.split(';')[0] == 'true' }
  license_check_ok = data_rows.select { |row| row.split(';')[1] == '0' }
  no_action = matching_only.select { |row| row.split(';')[1] == '0' }

  # The state of Fedora Rubygems (excl a few that dont have gem as their source in Fedora lookaside cache)
  File.write("rubygems_fedora_spdx_state.csv", final.join("\n"))
  # Where Fedora license == gem2rpm
  File.write("rubygems_fedora_gem2rpm_matches.csv", matching_only.join("\n"))
  # Where license-check returns 0
  File.write("rubygems_fedora_valid_spdx.csv", license_check_ok.join("\n"))
  # Licenses where we know there is valid SPDX and the licenses match
  File.write("rubygems_fedora_valid_no_action.csv", no_action.join("\n"))

  converted = comparison_rows(res, fedora_licenses, normalize: true)
              .sort(&sort_by_license_check)
              .select { |row| row.split(';')[1] == '0' }
  File.write("rubygems_try_convert_conjunctions.csv", converted.join("\n"))

  return unless argv[0] == '--email'

  puts email_summary(total: spec_files.size, data_rows: data_rows, converted: converted)
end

run(ARGV) if $PROGRAM_NAME == __FILE__