Skip to content

Instantly share code, notes, and snippets.

@JamesChevalier
Created January 5, 2019 18:56
Show Gist options
  • Save JamesChevalier/2211cb290102643031fccb37d5e86280 to your computer and use it in GitHub Desktop.
Save JamesChevalier/2211cb290102643031fccb37d5e86280 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
require 'rubygems'
require 'bundler/setup'
require 'nokogiri'
require 'json'
require 'ap'
# Small script for scraping POIs from JOSM (http://wiki.openstreetmap.org/wiki/JOSM_file_format).
#
# Streams the XML input through a Nokogiri pull reader and writes the
# id / lat / lon attributes of every <node> element to a JSON array,
# one object per line.
class Parser
  # Tags in data that we ignore.
  IGNORED_TAGS = %w[attribution condition created_by highway lanes maxspeed
                    network ref source source:hgv source:maxspeed surface].freeze

  # Number of reader steps between two progress dots on STDERR.
  PROGRESS_INTERVAL = 10_000

  def initialize
    # Attribute name => number of elements carrying it. This node-only
    # variant never fills it; it is still printed in the final summary.
    @popular_attributes = {}
    # Number of entries parsed.
    @count = 0
    # Number of reader steps taken so far.
    @total_count = 0
    # Number of entries written to the output.
    @included = 0
  end

  # Converts the OSM file at +input+ into a JSON array stored at +output+.
  #
  # The array is always terminated with an empty object ("{}") so that the
  # trailing comma written after each entry still yields valid JSON, even
  # when parsing aborts with an exception.
  #
  # @param input [String] path of the OSM XML file to read
  # @param output [String] path of the JSON file to (over)write
  def parse(input, output)
    File.open(output, 'w') do |out|
      begin
        out.write "[\n"
        # Block form closes the input handle once the reader is done with it.
        File.open(input) do |xml|
          reader = Nokogiri::XML::Reader(xml)
          reader = parse_node(reader, out) while reader
        end
      ensure
        out.write "{}\n]\n"
        report_summary
      end
    end
  end

  # Writes the next <node> found at or after the reader position to +out+.
  #
  # @param r [#read, #name, #node_type, #attribute] pull reader
  # @param out [#write] sink receiving one JSON object plus ",\n"
  # @return [Object, false, nil] the reader (to be passed to the next call),
  #   false when no further node exists, or nil when the input ended right
  #   after the node that was just written
  def parse_node(r, out)
    # Only an opening <node> carries the attributes. Matching on the name
    # alone would also accept the closing </node> of a node with child tags
    # and emit an entry whose id/lat/lon are all null.
    until r.nil? || node_start?(r)
      r = r.read
      progress
    end
    # Stop processing if end of file.
    return false unless r

    entry = { id: r.attribute('id'), lat: r.attribute('lat'), lon: r.attribute('lon') }

    # Skip the node's children: stop at its closing tag, at the next node,
    # or at the end of the input.
    loop do
      progress
      r = r.read
      break if r.nil? || r.name == 'node'
    end

    @included += 1
    puts "Writing #{entry}"
    out.write(entry.to_json)
    out.write(",\n")
    progress(true)
    r
  end

  # Progress info: counts one reader step and periodically reports on STDERR.
  #
  # @param entry_found [Boolean] true when the step completed an entry
  def progress(entry_found = false)
    @total_count += 1
    @count += 1 if entry_found
    return unless (@total_count % PROGRESS_INTERVAL).zero?

    STDERR.print '.'
    STDERR.print "\r#{@included} / #{@count} / #{@total_count}\t" if (@total_count % (PROGRESS_INTERVAL * 50)).zero?
    STDERR.flush
  end

  private

  # True when the reader sits on an opening (or self-closing) <node> element.
  def node_start?(reader)
    reader.name == 'node' && reader.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
  end

  # Prints the attribute statistics and the included / parsed / total counters.
  def report_summary
    ap @popular_attributes.sort_by { |_name, hits| hits }.reverse
    STDERR.puts ''
    puts "\n#{@included} / #{@count} / #{@total_count}\t"
  end
end
# Command-line entry point: requires the OSM input path and the JSON output path.
unless ARGV.size >= 2
  puts "Usage: #{$PROGRAM_NAME} osm_file output_json"
  exit 1
end

osm_path, json_path = ARGV
Parser.new.parse(osm_path, json_path)
#!/usr/bin/env ruby
require 'rubygems'
require 'bundler/setup'
require 'nokogiri'
require 'json'
require 'ap'
# Small script for scraping named ways (streets) from JOSM
# (http://wiki.openstreetmap.org/wiki/JOSM_file_format).
#
# Streams the XML input through a Nokogiri pull reader and writes every
# <way> that has a 'name' tag to a JSON array, one object per line. Each
# object holds the way's non-ignored tags plus a 'nodes' hash mapping the
# position of each <nd> child to its ref.
class Parser
  # Tags in data that we ignore.
  IGNORED_TAGS = %w[attribution condition created_by highway lanes maxspeed
                    network ref source source:hgv source:maxspeed surface].freeze

  # Number of reader steps between two progress dots on STDERR.
  PROGRESS_INTERVAL = 10_000

  def initialize
    # Tag key => number of ways carrying it.
    # It is helpful to see what tags should be taken into account in the first place during importing.
    @popular_attributes = {}
    # Number of ways parsed.
    @count = 0
    # Number of reader steps taken so far.
    @total_count = 0
    # Number of ways that had every required tag and were written out.
    @included = 0
  end

  # Converts the OSM file at +input+ into a JSON array stored at +output+.
  #
  # The array is always terminated with an empty object ("{}") so that the
  # trailing comma written after each entry still yields valid JSON, even
  # when parsing aborts with an exception.
  #
  # @param input [String] path of the OSM XML file to read
  # @param output [String] path of the JSON file to (over)write
  def parse(input, output)
    File.open(output, 'w') do |out|
      begin
        out.write "[\n"
        # Block form closes the input handle once the reader is done with it.
        File.open(input) do |xml|
          reader = Nokogiri::XML::Reader(xml)
          reader = parse_node(reader, out) while reader
        end
      ensure
        out.write "{}\n]\n"
        report_summary
      end
    end
  end

  # Collects the next <way> found at or after the reader position and writes
  # it to +out+ if it carries all required tags.
  #
  # @param r [#read, #name, #node_type, #attribute] pull reader
  # @param out [#write] sink receiving one JSON object plus ",\n"
  # @return [Object, false, nil] the reader (to be passed to the next call),
  #   false when no further way exists, or nil when the input ended while
  #   the way was being read
  def parse_node(r, out)
    # Only an opening <way> starts an entry. Matching on the name alone
    # would also accept the closing </way>, creating a phantom entry that
    # swallows whatever tags follow (e.g. those of relations).
    until r.nil? || way_start?(r)
      r = r.read
      progress
    end
    # Stop processing if end of file.
    return false unless r

    node_refs = {}
    entry = { 'nodes' => node_refs }
    # Required fields to create a usable entry; emptied as they are seen.
    missing = ['name']

    loop do
      progress
      r = r.read
      # The closing </way> or the next <way> ends this entry.
      break if r.nil? || r.name == 'way'
      # Text nodes and end elements carry no attributes.
      next unless element_start?(r)

      case r.name
      when 'tag'
        # Each tag has form of <tag k="key" v="value" />
        key = r.attribute('k')
        next if IGNORED_TAGS.include?(key)

        missing.delete(key)
        entry[key] = r.attribute('v')
        @popular_attributes[key] = @popular_attributes.fetch(key, 0) + 1
      when 'nd'
        # Each nd has form of <nd ref="value" />. Refs are always kept,
        # regardless of which tag happened to precede them.
        node_refs[node_refs.size] = r.attribute('ref')
      end
    end

    # Write the way only if all required tags were found.
    if missing.empty?
      @included += 1
      out.write(entry.to_json)
      out.write(",\n")
    end
    progress(true)
    r
  end

  # Progress info: counts one reader step and periodically reports on STDERR.
  #
  # @param entry_found [Boolean] true when the step completed an entry
  def progress(entry_found = false)
    @total_count += 1
    @count += 1 if entry_found
    return unless (@total_count % PROGRESS_INTERVAL).zero?

    STDERR.print '.'
    STDERR.print "\r#{@included} / #{@count} / #{@total_count}\t" if (@total_count % (PROGRESS_INTERVAL * 50)).zero?
    STDERR.flush
  end

  private

  # True when the reader sits on an opening (or self-closing) element.
  def element_start?(reader)
    reader.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
  end

  # True when the reader sits on an opening (or self-closing) <way> element.
  def way_start?(reader)
    reader.name == 'way' && element_start?(reader)
  end

  # Prints the tag statistics and the included / parsed / total counters.
  def report_summary
    ap @popular_attributes.sort_by { |_name, hits| hits }.reverse
    STDERR.puts ''
    puts "\n#{@included} / #{@count} / #{@total_count}\t"
  end
end
# Command-line entry point: bail out with a usage hint unless both paths are given.
source_file, target_file = ARGV
if target_file.nil?
  puts "Usage: #{$PROGRAM_NAME} osm_file output_json"
  exit 1
end

Parser.new.parse(source_file, target_file)
@JamesChevalier
Copy link
Author

Suggested update to ignore certain non-runnable features:
https://repl.it/@BrianSperlongan/QuickwittedIndolentNonagon

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment