"My data files are too large."
"I have many different files and I need to join them together."
- a fast and general-purpose cluster computing system
#!/usr/bin/env ruby | |
# encoding: utf-8 | |
require 'csv' | |
require 'dina' | |
Dina.config = { | |
authorization_url: 'https://dina.biodiversity.agr.gc.ca/auth', | |
endpoint_url: 'https://dina.biodiversity.agr.gc.ca/api', | |
server_name: 'dina-prod', |
# Q1312945 Expedition
# NOTE(review): the identifiers below look like Wikidata QIDs — confirm.
# NOTE(review): "Q96384" is listed twice, so its user id (when found) appears
# twice in user_ids; confirm whether the second entry was meant to be a
# different identifier.
qids = %w[
  Q108669 Q63760 Q62747 Q104839 Q96384 Q96384
  Q85444 Q101823 Q347529 Q43881351 Q95248572
]

# Look up a user for each identifier and collect the ids; nil results are
# dropped by the trailing `compact`.
user_ids = qids.map do |q|
  user = begin
    User.find_by_identifier(q)
  rescue StandardError
    # Best-effort lookup: any StandardError raised by the lookup is treated
    # the same as "no user found" (same scope as the former `rescue nil`).
    nil
  end
  user&.id
end.compact
#!/usr/bin/env ruby | |
# encoding: utf-8 | |
require 'csv' | |
require 'rest_client' | |
require 'json' | |
BASE_URL = "https://api.openalex.org/works?filter=concepts.id:C58642233,has_orcid:true,publication_year:2023&per_page=50&page=" | |
def get_data(page:) |
# encoding: utf-8 | |
class String | |
def is_orcid? | |
/(\d{4}-){3}\d{3}[0-9X]{1}$/.match?(self) | |
end | |
def valid_orcid? | |
parts = self.scan(/[0-9X]/) | |
mod = parts[0..14].map(&:to_i) | |
.inject { |sum, n| (sum + n)*2 } |
# Install via command-line as 'gem install sparql-client' | |
require 'sparql/client' | |
headers = { 'User-Agent' => 'Ruby-Sparql-Client/1.0' } | |
@sparql = SPARQL::Client.new("https://query.wikidata.org/sparql", headers: headers, read_timeout: 120) | |
# A SPARQL query to find an item and an optional Twitter handle | |
def wikidata_by_orcid_query(orcid) | |
%Q( | |
SELECT ?item ?itemLabel ?twitter |
#!/usr/bin/env ruby | |
# encoding: utf-8 | |
require 'rest_client' | |
require 'csv' | |
require 'nokogiri' | |
require 'colorize' | |
page_range = 0..50 |
#!/usr/bin/env ruby | |
# encoding: utf-8 | |
require 'rest_client' | |
require 'csv' | |
require 'nokogiri' | |
require 'colorize' | |
page_range = 1..10 |
"My data files are too large."
"I have many different files and I need to join them together."
GBIF URL,recordedBy,eventDate,year,country,countryCode,GBIF Dataset | |
https://gbif.org/occurrence/2433942,"Mrs. C. Pease, Miss E. Butler",,1903,Jamaica,JM,https://gbif.org/dataset/40d2de00-0c6e-11dd-84d2-b8a03c50a862 | |
https://gbif.org/occurrence/29404620,U.Mizushima (Mrs.),1954-09-06T01:00Z,1954,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a | |
https://gbif.org/occurrence/29408002,U.Mizushima (Mrs.),1954-09-19T01:00Z,1954,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a | |
https://gbif.org/occurrence/29426346,U.Mizushima (Mrs.),1952-08-01T01:00Z,1952,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a | |
https://gbif.org/occurrence/29429161,U.Mizushima (Mrs.),1954-09-29T01:00Z,1954,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a | |
https://gbif.org/occurrence/29451087,U.Mizushima (Mrs.),1955-03-04T01:00Z,1955,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a | |
https://gbif.org/occurrence/29451907,Mrs. Fay A. Mac Fadden,1926-08-16T01:00Z,1926,, |
{:user_id=>11771, :name=>"Cyrus Pringle", :orphaned=>716}, | |
{:user_id=>14743, :name=>"Gerdt Guenther Hatschbach", :orphaned=>594}, | |
{:user_id=>191, :name=>"Volker Framenau", :orphaned=>586}, | |
{:user_id=>35074, :name=>"Martti Rautanen", :orphaned=>454}, | |
{:user_id=>10182, :name=>"Georg August Zenker", :orphaned=>381}, | |
{:user_id=>12169, :name=>"Paul Sintenis", :orphaned=>349}, | |
{:user_id=>9829, :name=>"Joseph Friedrich Nicolaus Bornmüller", :orphaned=>302}, | |
{:user_id=>10487, :name=>"José Arechavaleta", :orphaned=>250}, | |
{:user_id=>11937, :name=>"Theodor Kotschy", :orphaned=>233}, | |
{:user_id=>11853, :name=>"Ynes Mexia", :orphaned=>150}, |