Skip to content

Instantly share code, notes, and snippets.

@georgy7
Created May 6, 2017 12:17
Show Gist options
  • Save georgy7/8a23696ff128df0ed11b0ffe20686964 to your computer and use it in GitHub Desktop.
Save georgy7/8a23696ff128df0ed11b0ffe20686964 to your computer and use it in GitHub Desktop.
Статьи с тегами. Тестовые данные для бенчмарков.
.idea
.directory
data
#!/usr/bin/env ruby
# encoding: utf-8
require 'securerandom'
require 'set'
require 'json'
TAG_COUNT = 10_000
ARTICLE_COUNT = 1_000_000
# Returns a random string built from SecureRandom.base64 output with every
# '/', '\', '+' and '=' character removed.
#
# @return [String] the filtered random string
def randstr
  raw = SecureRandom.base64
  # One pass over a character class removes all four unwanted characters.
  raw.gsub(%r{[/\\+=]}, '')
end
# Generates TAG_COUNT distinct random tags and writes them to two files:
# data/tags.json (a bracketed, comma-separated list of quoted tags, one per
# line) and data/tags.txt (one bare tag per line).
#
# @return [Array<String>] the generated tags, in the order they were written
def genTags
  seen_tags = Set.new
  tags = []
  # Block-form File.open closes both files even if generation raises midway.
  File.open('data/tags.json', 'w') do |tags_json|
    File.open('data/tags.txt', 'w') do |tags_txt|
      tags_json.puts('[')
      1.upto(TAG_COUNT) do |i|
        # Truncating to 9 characters makes collisions more likely than with
        # full-length strings, so keep drawing until the tag is one that has
        # not been seen before. Set#add? returns nil for a duplicate.
        tag = randstr[0..8]
        tag = randstr[0..8] until seen_tags.add?(tag)
        tags.push(tag)
        tags_txt.puts(tag)
        # Every entry except the last is followed by a comma.
        separator = i == TAG_COUNT ? '' : ','
        tags_json.puts(" \"#{tag}\"#{separator}")
      end
      tags_json.puts(']')
    end
  end
  tags
end
# Generates ARTICLE_COUNT random article titles and writes them to two files:
# data/article_names.json (a bracketed, comma-separated list of quoted titles,
# one per line) and data/article_names.txt (one bare title per line).
# No uniqueness check is performed in this method.
#
# @return [Array<String>] the generated titles, in the order they were written
def gen_article_names
  titles = []
  # Block-form File.open closes both files even if generation raises midway.
  File.open('data/article_names.json', 'w') do |names_json|
    File.open('data/article_names.txt', 'w') do |names_txt|
      names_json.puts('[')
      1.upto(ARTICLE_COUNT) do |i|
        title = randstr
        titles.push(title)
        names_txt.puts(title)
        # Every entry except the last is followed by a comma.
        separator = i == ARTICLE_COUNT ? '' : ','
        names_json.puts(" \"#{title}\"#{separator}")
      end
      names_json.puts(']')
    end
  end
  titles
end
# Picks `count` distinct random integers from the range 0...max_exclusively.
#
# @param count [Integer] how many distinct values to pick; zero (or less)
#   yields an empty array
# @param max_exclusively [Integer] exclusive upper bound of the picked values
# @return [Array<Integer>] the picked values, in the order they were drawn
# @raise [ArgumentError] if more distinct values are requested than the range
#   contains (the request could never be satisfied)
def unique_randoms(count, max_exclusively)
  if count > max_exclusively
    raise ArgumentError, "cannot pick #{count} unique values below #{max_exclusively}"
  end

  picked = Set.new
  # Rejection sampling: the Set silently drops repeated draws, so keep drawing
  # until enough distinct values are collected. The size is checked BEFORE
  # each draw so that a request for zero values really returns none.
  picked.add(rand(max_exclusively)) while picked.size < count
  picked.to_a
end
# Assigns a random selection of tags to every article and writes the result to
# two files whose names include maxtagsperpage:
#   data/references_only_<max>.json    - per article: its index and the indices
#                                        of its tags within tag_names
#   data/articles_with_tags_<max>.json - per article: its title and tag names
#
# NOTE(review): entries in references_only_*.json are written as
# `index: [..]` inside `[ ... ]`, which is not strictly valid JSON. The format
# is kept byte-for-byte because consumers of these files are not visible here.
#
# @param tag_names [Array<String>] all available tags
# @param article_names [Array<String>] article titles, one entry per article
# @param maxtagsperpage [Integer] upper bound passed to the random tag-count draw
# @return [nil]
def gen_references(tag_names, article_names, maxtagsperpage)
  last_index = article_names.size - 1
  # Never ask for more distinct tags than exist; such a request cannot be met.
  max_tags = [maxtagsperpage, tag_names.size].min

  # Block-form File.open closes both files even if generation raises midway.
  File.open("data/references_only_#{maxtagsperpage}.json", 'w') do |references|
    File.open("data/articles_with_tags_#{maxtagsperpage}.json", 'w') do |articles_with_tags|
      references.puts('[')
      articles_with_tags.puts('[')
      article_names.each_with_index do |article, article_index|
        tags_count = rand(0..max_tags)
        tag_indices = unique_randoms(tags_count, tag_names.size)
        article_tags = tag_indices.map { |tag_index| tag_names[tag_index] }
        # Every entry except the last is followed by a comma.
        separator = article_index == last_index ? '' : ','
        references.puts(" #{article_index}: #{JSON.generate(tag_indices)}#{separator}")
        articles_with_tags.puts(
          " {\"title\": \"#{article}\", \"tags\": #{JSON.generate(article_tags)}}#{separator}"
        )
      end
      references.puts(']')
      articles_with_tags.puts(']')
    end
  end
end
# Generates the complete benchmark data set inside the data/ directory:
# tags, article names, and three tag-assignment sets with different
# per-article tag limits.
#
# @raise [RuntimeError] if the generated lists have an unexpected size or the
#   article names contain duplicates
# @return [nil]
def main
  Dir.mkdir('data') unless Dir.exist?('data')

  tag_names = genTags
  article_names = gen_article_names

  # Sanity checks on the generated lists before they are used further.
  raise 'bad tag_names size' if tag_names.size != TAG_COUNT
  raise 'bad article_names size' if article_names.size != ARTICLE_COUNT
  raise 'article_names has duplicates' if article_names.uniq.size != article_names.size

  # Generate three different, independent data sets.
  [20, 100, 1000].each do |max_tags_per_article|
    gen_references(tag_names, article_names, max_tags_per_article)
  end
  nil
end
# Script entry point: run the generator as soon as this file is executed.
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment