Skip to content

Instantly share code, notes, and snippets.

@robert-b-clarke
Last active December 9, 2020 21:10
Show Gist options
  • Save robert-b-clarke/5228381 to your computer and use it in GitHub Desktop.
Save robert-b-clarke/5228381 to your computer and use it in GitHub Desktop.
Split a large GTFS file into separate ones for each agency - tested on http://www.datagm.org.uk/package/public-transport-schedules--gtfs
=head1 NAME
gtfs-splitter.pl
=head1 DESCRIPTION
Hastily assembled and extremely unguaranteed perl script for dividing gtfs files into smaller per agency files
=head1 SYNOPSIS
from command line
> unzip BigGTFS.ZIP
> perl gtfs-splitter.pl BigGTFS SmallerGTFS
#then zip up the individual gtfs directories
=cut
#!/usr/bin/perl
use Modern::Perl;
use Text::CSV_XS;
use Data::Dumper;
use File::Path qw/mkpath/;
use File::Copy;
# ---------------------------------------------------------------------------
# Main script: read each GTFS file from the input directory, ask
# guess_agency() which agency (or agencies) every row belongs to, and append
# the raw row to <output>/<agency>/<file>.
#
#   $ARGV[0] - directory containing the unzipped source GTFS feed
#   $ARGV[1] - directory under which one sub-directory per agency is written
# ---------------------------------------------------------------------------
my $container_path = $ARGV[0] or die "Need an input path";
my $output_path = $ARGV[1] or die "Need an output path";
#list of files we're gonna process
# The order matters: guess_agency() builds its lookup tables as it goes, so a
# file must come after the files whose ids it refers to.
my @filenames = ('agency.txt', 'routes.txt', 'trips.txt', 'calendar.txt', 'stop_times.txt', 'calendar_dates.txt','stops.txt');
#split GTFS files according to agency
# Lookup tables shared with (and filled in by) guess_agency().
my %route_to_agency = (); #route id to agency id mapping
my %trip_to_agency = (); #trip id to agency id mapping
my %service_to_agency = (); #service id to agency mapping
my %stop_to_agency = (); #stopid to agency - this one contains arrays
# binary => 1 allows non-ASCII bytes inside fields (stop and route names).
my $csv_in = Text::CSV_XS->new({ binary => 1 })
    or die "can't create CSV parser: " . Text::CSV_XS->error_diag();
foreach my $filename (@filenames) {
    my $filepath = join('/', ($container_path, $filename));
    open my $fh_in, '<', $filepath or die "can't open $filename: $!";
    say "-----\nProcessing $filename\n----";
    my $header_line = <$fh_in>;
    unless (defined $header_line) {
        warn "$filename is empty, skipping";
        close $fh_in;
        next;
    }
    # The file is read as raw bytes, so a UTF-8 byte order mark would end up
    # glued to the first column name and break the lookup by header.
    $header_line =~ s/\A\xEF\xBB\xBF//;
    $csv_in->parse($header_line)
        or die "can't parse header of $filename: " . $csv_in->error_diag();
    my $headers = [$csv_in->fields];
    while (my $row_line = <$fh_in>) {
        next unless $row_line =~ /\S/; # ignore blank lines
        unless ($csv_in->parse($row_line)) {
            warn "can't parse line $. of $filename: " . $csv_in->error_diag();
            next;
        }
        my $row = [$csv_in->fields];
        my @agencies = guess_agency($headers, $row, $filename);
        foreach my $agency (@agencies) {
            # An undefined agency would otherwise be written straight into
            # the output root instead of an agency directory.
            unless (defined $agency) {
                warn "no agency for line $. of $filename, skipping";
                next;
            }
            my $target_dir = agency_dir($output_path, $agency);
            my $output_fh = output_fh($target_dir, $filename, $header_line);
            print {$output_fh} $row_line
                or die "can't write to $target_dir/$filename: $!";
            # Buffered write errors only surface when the handle is closed.
            close $output_fh
                or die "can't close $target_dir/$filename: $!";
        }
    }
    close $fh_in;
}
#just copy feed_info.txt
# The copy is skipped when the source feed has no feed_info.txt.
my $feed_info_path = join('/', ($container_path, 'feed_info.txt'));
if (-e $feed_info_path) {
    # Every distinct agency that owns at least one route.
    my %seen_agency;
    my @route_agencies = grep { defined $_ && !$seen_agency{$_}++ } values %route_to_agency;
    foreach my $agency (sort @route_agencies) {
        my $target_dir = agency_dir($output_path, $agency);
        my $agency_feed_info = join '/', ($target_dir, 'feed_info.txt');
        copy($feed_info_path, $agency_feed_info)
            or warn "can't copy feed_info.txt to $agency_feed_info: $!";
    }
}
#warn Dumper(\%stop_to_agency);
exit();
# Decide which agency (or agencies) one parsed GTFS row belongs to.
#
# Arguments:
#   $headers  - array ref of column names taken from the file's header line
#   $row      - array ref of field values, in the same order as $headers
#   $filename - name of the GTFS file the row came from (e.g. 'trips.txt')
#
# Returns a list of agency ids the row should be written for.
#   * agency.txt / routes.txt: the row's own agency_id, as-is (undef when the
#     column is missing).
#   * trips.txt: the agency of the trip's route.
#   * stop_times.txt: the agency of the row's trip.
#   * anything else: looked up by trip_id, then service_id, then stop_id.
# An empty list is returned (after a warning) when no agency can be found.
#
# Side effects: fills the file-level tables %route_to_agency and
# %trip_to_agency (id => agency id) and %service_to_agency and
# %stop_to_agency (id => array ref of agency ids). Files therefore have to be
# processed in dependency order.
#
# Dies with "no agency" when a trips.txt row refers to a route for which no
# agency has been recorded.
sub guess_agency {
    my ($headers, $row, $filename) = @_;

    # Column name => value view of the row.
    my %record;
    @record{@$headers} = @$row;

    if ($filename eq 'agency.txt') {
        return $record{agency_id};
    }

    if ($filename eq 'routes.txt') {
        $route_to_agency{ $record{route_id} } = $record{agency_id};
        return $record{agency_id};
    }

    if ($filename eq 'trips.txt') {
        my $agency_id = $route_to_agency{ $record{route_id} };
        # Tested with defined/length rather than truthiness so that an
        # agency id of "0" is accepted.
        die "no agency" unless defined $agency_id && length $agency_id;
        $trip_to_agency{ $record{trip_id} } = $agency_id;
        # One service may be used by trips of several agencies, so every
        # agency is kept rather than only the last one seen.
        _remember_agency(\%service_to_agency, $record{service_id}, $agency_id);
        return $agency_id;
    }

    if ($filename eq 'stop_times.txt') {
        my $trip_agency = $trip_to_agency{ $record{trip_id} };
        unless (defined $trip_agency) {
            # Unknown trip: skip the row instead of handing back undef.
            warn "can't process record from $filename";
            return;
        }
        _remember_agency(\%stop_to_agency, $record{stop_id}, $trip_agency);
        return $trip_agency;
    }

    #try trip_id, then service, then stop
    my ($trip_id, $service_id, $stop_id) = @record{qw(trip_id service_id stop_id)};
    if (defined $trip_id && defined $trip_to_agency{$trip_id}) {
        return $trip_to_agency{$trip_id};
    }
    elsif (defined $service_id && defined $service_to_agency{$service_id}) {
        return @{ $service_to_agency{$service_id} };
    }
    elsif (defined $stop_id && defined $stop_to_agency{$stop_id}) {
        return @{ $stop_to_agency{$stop_id} };
    }

    #warn "can't process record from $filename with details ".Dumper(\%record);
    warn "can't process record from $filename";
    return;
}

# Add $agency to the list stored under $key in %$agencies_for, unless it is
# already there. Ids are compared as strings so "01" and "1" stay distinct.
sub _remember_agency {
    my ($agencies_for, $key, $agency) = @_;
    my $known = $agencies_for->{$key} //= [];
    push @$known, $agency unless grep { $_ eq $agency } @$known;
    return;
}
# Build the path of the directory that holds one agency's split files.
#
# Arguments (in call order):
#   $output_dir - root output directory
#   $agency     - agency id, used as the sub-directory name
#
# Returns "$output_dir/$agency". Nothing is created on disk.
sub agency_dir{
    my ($output_dir, $agency) = @_;
    return join '/', ($output_dir, $agency);
}
# Open an agency's copy of a GTFS file for appending and return the handle.
#
# Arguments:
#   $dir         - agency output directory; created if it is missing
#   $filename    - name of the GTFS file inside $dir (e.g. 'routes.txt')
#   $header_line - raw header line, written only when the file is created
#
# Returns a write handle positioned at the end of the file.
# Dies if the file cannot be opened.
#
# NOTE: a file that already exists (for example from an earlier run) is
# appended to and does not receive a second header.
sub output_fh{
    my ($dir, $filename, $header_line) = @_;
    mkpath($dir); #make dir if we don't have it already
    my $output_path = join('/', ($dir, $filename));
    # Must be checked before opening: '>>' creates the file, which would hide
    # the fact that the header still has to be written.
    my $needs_header = !-e $output_path;
    open my $fh, '>>', $output_path or die "can't open $output_path: $!";
    print {$fh} $header_line if $needs_header;
    return $fh;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment