Skip to content

Instantly share code, notes, and snippets.

@voising
Forked from jico/crawl.rb
Last active December 19, 2015 06:18
Show Gist options
  • Save voising/5910072 to your computer and use it in GitHub Desktop.
Save voising/5910072 to your computer and use it in GitHub Desktop.
Crawl website links #crawl
require 'net/http'
require 'uri'
# A start URL is mandatory. Report misuse on stderr and exit with a non-zero
# status (Kernel#abort does both) so calling shells can detect the failure.
abort "Usage : #{$PROGRAM_NAME} url [output] " if ARGV.empty?
# Breadth-first crawler that collects the absolute http(s) links reachable
# from a start page.
#
# Discovered links accumulate in #url_list in discovery order, without
# duplicates. Each page is fetched at most once per #crawl call.
class Crawler
  # Default cap on the number of links collected.
  DEFAULT_LIMIT = 1000

  # Matches an anchor tag carrying an absolute http/https href; the URL is
  # capture group 1. `[^>]*?` keeps the match inside a single tag, so several
  # anchors on the same line are each found (a greedy `.*` would only yield
  # the last one).
  LINK_PATTERN = %r{<a\b[^>]*?\shref\s*=\s*["'](https?://[^"'\s>]+)["']}i.freeze

  # @return [Array<String>] links discovered so far, in discovery order
  attr_reader :url_list

  def initialize
    @url_list = []
    @known = {} # url => true, constant-time membership index for @url_list
  end

  # Crawls breadth-first starting at +url+, appending every newly discovered
  # link to #url_list until no pages are left or +limit+ links are collected.
  #
  # @param url [String] page to start from (itself only listed if some page links back to it)
  # @param limit [Integer] maximum size of #url_list
  # @return [Array<String>] the accumulated #url_list
  def crawl(url, limit: DEFAULT_LIMIT)
    fetched = { url => true }
    pending = remember(extract_urls(url), limit)

    until pending.empty? || @url_list.size >= limit
      current = pending.shift
      next if fetched[current] # e.g. a link back to the start page

      fetched[current] = true
      puts "analysing #{current}..\n"
      pending.concat(remember(extract_urls(current), limit))
    end

    @url_list
  end

  # Fetches +url+ and returns the distinct absolute http(s) links found in
  # its anchor tags. Crawling is best-effort: an unparsable URL, a non-HTTP
  # scheme, a network failure or an empty body yields [] (with a warning on
  # stderr for errors) instead of aborting the whole crawl.
  #
  # @param url [String]
  # @return [Array<String>]
  def extract_urls(url)
    uri = URI.parse(url)
    return [] unless uri.is_a?(URI::HTTP) # URI::HTTPS is a subclass of URI::HTTP

    response = Net::HTTP.get_response(uri)
    # body can be nil (e.g. responses without content); to_s turns that into "".
    response.body.to_s.scan(LINK_PATTERN).flatten.uniq
  rescue StandardError => e
    warn "skipping #{url}: #{e.class}: #{e.message}"
    []
  end

  private

  # Appends the not-yet-known +links+ to @url_list, never growing it beyond
  # +limit+, and returns the links that were actually added.
  def remember(links, limit)
    links.each_with_object([]) do |link, fresh|
      break fresh if @url_list.size >= limit
      next if @known[link]

      @known[link] = true
      @url_list << link
      fresh << link
    end
  end
end
# Script entry point: crawl from the URL given as the first argument, print
# the collected links, and optionally save them to the file named by the
# second argument.
crawler = Crawler.new
crawler.crawl(ARGV[0])

puts "\n\nLINKS ARE :\n"
puts crawler.url_list

output_path = ARGV[1]
if output_path
  # One URL per line, mirroring the listing printed above. Passing the array
  # straight to IO#write would store its inspect form (`["http://...", ...]`).
  File.write(output_path, crawler.url_list.map { |link| "#{link}\n" }.join)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment