wvengen · March 3, 2020 12:27
diff --git a/gather-fish-data-fishbase.rb b/gather-fish-data-fishbase.rb
 #!/usr/bin/env ruby
 #
 # We start with the list at:
 #   http://www.fao.org/fishery/collection/asfis
 # Then we query FishBase (and SeaLifeBase) for the ID and NL name.
 #
 # Before running this script, make sure you have the ASFIS spreadsheet,
 # and make sure you've created the CACHEDIR directory (which stores all
 # requests, so you can run this again without having to wait for the requests).
 #
 # More sources:
 #   http://www.fishbase.org/
 #   http://www.sealifebase.org/
 #   http://www.fao.org/fishery/species/search
 #
 require 'net/http'
 require 'nokogiri'
 require 'roo'

 FAO_SHEET = 'ASFIS_sp_2019.xlsx'
 CACHEDIR = './fishbase-cache'

 out = CSV.open('asfis_fishbase.csv', 'w')
 fao = Roo::Spreadsheet.open(FAO_SHEET)

 batch = []

 def with_cache(cachekeys)
  cachekey = cachekeys.join('-').gsub(/[^-a-z0-9]+/i, '_')
  cachefile = File.join(CACHEDIR, cachekey + '.html')
  if File.exists?(cachefile)
    File.read(cachefile)
  else
    body = yield
    File.write(cachefile, body)
    body
  end

 end

 def http_post_form(uri, params, cachekeys)
  with_cache(cachekeys) do
    sleep 0.5
    Net::HTTP.post_form(uri, params).body
  end
 end

 def http_get(uri, cachekeys)
  with_cache(cachekeys) do
    sleep 0.5
    Net::HTTP.get_response(uri).body
  end
 end


 def find_fishbase(s, baseurl = 'https://www.fishbase.se/', idkey='fishbase_id')
  result = {}
  genus, species = s.split(/\s+/, 2).map(&:strip)

  # first get id
  search_uri = URI::join(baseurl, '/Nomenclature/ScientificNameSearchList.php')
  body = http_post_form(search_uri, {
    'crit1_fieldname': 'SYNONYMS.SynGenus',
    'crit1_fieldtype': 'CHAR',
    'crit2_fieldname': 'SYNONYMS.SynSpecies',
    'crit2_fieldtype': 'CHAR',
    'crit1_operator': 'EQUAL',
    'crit1_value': genus,
    'crit2_operator': 'EQUAL',
    'crit2_value': species
  }, [idkey, 'lookup', genus, species])

  if body =~ /SpeciesSummary.php\?ID=(\d+)\b/
    result[idkey] = id = $1.to_i

    # then get common names
    body = http_get(URI::join(baseurl, "/ComNames/CommonNamesList.php?ID=#{id}"), [idkey, 'names', id])
    doc = Nokogiri::HTML.parse(body)
    if cell = doc.at_css('td:contains("English")')
      result['label_en'] = cell.ancestors('tr').first.css('td:first-child').text.strip
    end
    if cell = doc.at_css('td:contains("Dutch")')
      result['label_nl'] = cell.ancestors('tr').first.css('td:first-child').text.strip
    end
  elsif idkey != 'sealifebase_id'
    result = find_fishbase(s, 'https://www.sealifebase.se/', 'sealifebase_id')
  end

  result
 end


 def process_row(row, out)
  taxocode = row['TAXOCODE']
  alpha = row['3A_CODE']
  species = row['Scientific_name']
  family = row['Family']
  order = row['Order'].capitalize
  name_en = row['English_name']
  w_name_en = nil
  w_name_nl = nil
  fishbase_id = nil
  sealifebase_id = nil

  if wds = find_fishbase(species.strip.downcase)
    w_name_en = wds['label_en']
    w_name_nl = wds['label_nl']
    fishbase_id = wds['fishbase_id']
    sealifebase_id = wds['sealifebase_id']

    w_name_en = nil if w_name_en&.downcase == species.downcase
    w_name_nl = nil if w_name_nl&.downcase == species.downcase
  end

  out << [alpha, taxocode, fishbase_id, sealifebase_id, name_en, w_name_en, w_name_nl, species, family, order]
 end


 row_count = fao.sheet(0).count
 fao.sheet(0).parse(headers: true).each_with_index do |row, i|
  next if i == 0
  #next unless row['English_name']
  STDERR.puts "Row #{i} of #{row_count}" if i%20 == 0
  process_row(row, out)
 end
diff --git a/gather-fish-data-wikidata.rb b/gather-fish-data-wikidata.rb
 #!/usr/bin/env ruby
 #
 # We start with the list at:
 #   http://www.fao.org/fishery/collection/asfis
 # Save the spreadsheet as ASFIS_sp_2019.xlsx and run this script,
 # which queries Wikidata for the Dutch (NL) name and fishbase_id.
 # Data is stored in asfis_wikidata.csv.
 #
 # Note that the Wikidata SPARQL endpoint can sometimes timeout, so
 # I'd go for the FishBase approach instead.
 #
 # More sources:
 #   https://www.fishbase.org/
 #   http://www.fao.org/fishery/species/search
 #
 require 'roo'
 require 'sparql/client'

 FAO_SHEET = 'ASFIS_sp_2019.xlsx'
 BATCH_SIZE = 35
 OUT_CSV = 'asfis_wikidata.csv'

 def query(sparql, terms)
  result = sparql.query(<<-EOS)
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wikibase: <http://wikiba.se/ontology#>

    SELECT
      ?item
      ?taxon_name
      ?label_en
      ?label_nl
      ?fishbase_id
    WHERE
    {
      ?item wdt:P225 ?taxon_name .
      OPTIONAL { ?item wdt:P938 ?fishbase_id . }
      
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". ?item rdfs:label ?label_en }    
      SERVICE wikibase:label { bd:serviceParam wikibase:language "nl". ?item rdfs:label ?label_nl. hint:Prior hint:runLast false. }

      FILTER(?taxon_name IN (#{terms.map {|t| '"' + t + '"'}.join(',')}))
    }
  EOS

  result.each_solution do |q|
    yield(q)
  end
 end

 sparql = SPARQL::Client.new('https://query.wikidata.org/sparql', headers: {'User-Agent': 'QMWikidataGatherer/0.0 (https://www.thequestionmark.org/)'})
 out = CSV.open(OUT_CSV, 'w')
 fao = Roo::Spreadsheet.open(FAO_SHEET)

 batch = []


 def process_batch(sparql, batch, out)
  all_species = batch.map {|b| b['Scientific_name'] }.uniq
  wd_by_species = {}

  query(sparql, all_species) do |r|
    wd_by_species[r['taxon_name'].to_s] = {
      'label_en' => r['label_en'].to_s&.strip,
      'label_nl' => r['label_nl'].to_s&.strip,
      'fishbase_id' => r['fishbase_id'].to_s&.to_i
    }
  end

  batch.each do |row|
    taxocode = row['TAXOCODE']
    alpha = row['3A_CODE']
    species = row['Scientific_name']
    family = row['Family']
    order = row['Order'].capitalize
    name_en = row['English_name']
    w_name_en = nil
    w_name_nl = nil
    firebase_id = nil

    if wds = wd_by_species[species]
      w_name_en = wds['label_en']
      w_name_nl = wds['label_nl']
      fishbase_id = wds['fishbase_id']

      w_name_en = nil if w_name_en&.downcase == species.downcase
      w_name_nl = nil if w_name_nl&.downcase == species.downcase
    end

    out << [alpha, taxocode, fishbase_id, name_en, w_name_en, w_name_nl, species, family, order]
  end
 end


 row_count = fao.sheet(0).count
 fao.sheet(0).parse(headers: true).each_with_index do |row, i|
  next if i == 0
  batch << row

  if batch.length > BATCH_SIZE
    STDERR.puts "Row #{i} of #{row_count}"
    process_batch(sparql, batch, out)
    batch = []
    sleep 1
  end

 end

 process_batch(sparql, batch, out)
	#!/usr/bin/env ruby
	#
	# We start with the list at:
	# http://www.fao.org/fishery/collection/asfis
	# Then we query FishBase (and SeaLifeBase) for the ID and NL name.
	#
	# Before running this script, make sure you have the ASFIS spreadsheet,
	# and make sure you've created the CACHEDIR directory (which stores all
	# requests, so you can run this again without having to wait for the requests).
	#
	# More sources:
	# http://www.fishbase.org/
	# http://www.sealifebase.org/
	# http://www.fao.org/fishery/species/search
	#
	require 'net/http'
	require 'nokogiri'
	require 'roo'

	FAO_SHEET = 'ASFIS_sp_2019.xlsx'
	CACHEDIR = './fishbase-cache'

	out = CSV.open('asfis_fishbase.csv', 'w')
	fao = Roo::Spreadsheet.open(FAO_SHEET)

	batch = []

	def with_cache(cachekeys)
	cachekey = cachekeys.join('-').gsub(/[^-a-z0-9]+/i, '_')
	cachefile = File.join(CACHEDIR, cachekey + '.html')
	if File.exists?(cachefile)
	File.read(cachefile)
	else
	body = yield
	File.write(cachefile, body)
	body
	end

	end

	def http_post_form(uri, params, cachekeys)
	with_cache(cachekeys) do
	sleep 0.5
	Net::HTTP.post_form(uri, params).body
	end
	end

	def http_get(uri, cachekeys)
	with_cache(cachekeys) do
	sleep 0.5
	Net::HTTP.get_response(uri).body
	end
	end


	def find_fishbase(s, baseurl = 'https://www.fishbase.se/', idkey='fishbase_id')
	result = {}
	genus, species = s.split(/\s+/, 2).map(&:strip)

	# first get id
	search_uri = URI::join(baseurl, '/Nomenclature/ScientificNameSearchList.php')
	body = http_post_form(search_uri, {
	'crit1_fieldname': 'SYNONYMS.SynGenus',
	'crit1_fieldtype': 'CHAR',
	'crit2_fieldname': 'SYNONYMS.SynSpecies',
	'crit2_fieldtype': 'CHAR',
	'crit1_operator': 'EQUAL',
	'crit1_value': genus,
	'crit2_operator': 'EQUAL',
	'crit2_value': species
	}, [idkey, 'lookup', genus, species])

	if body =~ /SpeciesSummary.php\?ID=(\d+)\b/
	result[idkey] = id = $1.to_i

	# then get common names
	body = http_get(URI::join(baseurl, "/ComNames/CommonNamesList.php?ID=#{id}"), [idkey, 'names', id])
	doc = Nokogiri::HTML.parse(body)
	if cell = doc.at_css('td:contains("English")')
	result['label_en'] = cell.ancestors('tr').first.css('td:first-child').text.strip
	end
	if cell = doc.at_css('td:contains("Dutch")')
	result['label_nl'] = cell.ancestors('tr').first.css('td:first-child').text.strip
	end
	elsif idkey != 'sealifebase_id'
	result = find_fishbase(s, 'https://www.sealifebase.se/', 'sealifebase_id')
	end

	result
	end


	def process_row(row, out)
	taxocode = row['TAXOCODE']
	alpha = row['3A_CODE']
	species = row['Scientific_name']
	family = row['Family']
	order = row['Order'].capitalize
	name_en = row['English_name']
	w_name_en = nil
	w_name_nl = nil
	fishbase_id = nil
	sealifebase_id = nil

	if wds = find_fishbase(species.strip.downcase)
	w_name_en = wds['label_en']
	w_name_nl = wds['label_nl']
	fishbase_id = wds['fishbase_id']
	sealifebase_id = wds['sealifebase_id']

	w_name_en = nil if w_name_en&.downcase == species.downcase
	w_name_nl = nil if w_name_nl&.downcase == species.downcase
	end

	out << [alpha, taxocode, fishbase_id, sealifebase_id, name_en, w_name_en, w_name_nl, species, family, order]
	end


	row_count = fao.sheet(0).count
	fao.sheet(0).parse(headers: true).each_with_index do \|row, i\|
	next if i == 0
	#next unless row['English_name']
	STDERR.puts "Row #{i} of #{row_count}" if i%20 == 0
	process_row(row, out)
	end