Created
August 29, 2014 16:42
-
-
Save nicolewhite/167828e51d8f2b6fad75 to your computer and use it in GitHub Desktop.
Get politicians' tweets and put into csv.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tweepy | |
from tweepy import Cursor | |
import unicodecsv | |
from unidecode import unidecode | |
# Authentication and connection to Twitter API. | |
consumer_key = "" | |
consumer_secret = "" | |
access_key = "" | |
access_secret = "" | |
auth = tweepy.OAuthHandler(consumer_key, consumer_secret) | |
auth.set_access_token(access_key, access_secret) | |
api = tweepy.API(auth) | |
# Usernames whose tweets we want to gather. | |
users = ["lisamurkowski", | |
"SenJohnMcCain", | |
"JeffFlake", | |
"SenMarkPryor", | |
"JohnBoozman", | |
"SenFeinstein", | |
"SenatorBoxer", | |
"MarkUdall", | |
"SenBennetCO", | |
"ChrisMurphyCT", | |
"SenBlumenthal", | |
"SenatorCarper", | |
"ChrisCoons", | |
"marcorubio", | |
"SaxbyChambliss", | |
"brianschatz", | |
"maziehirono", | |
"MikeCrapo", | |
"SenatorRisch", | |
"SenatorDurbin", | |
"SenDonnelly", | |
"SenDanCoats", | |
"ChuckGrassley", | |
"SenatorHarkin", | |
"SenPatRoberts", | |
"JerryMoran", | |
"SenRandPaul", | |
"SenLandrieu", | |
"DavidVitter", | |
"SenatorBarb", | |
"MarkeyMemo", | |
"stabenow", | |
"amyklobuchar", | |
"SenatorWicker", | |
"clairecmc", | |
"RoyBlunt", | |
"jontester", | |
"SenatorFischer", | |
"SenatorReid", | |
"SenDeanHeller", | |
"SenatorShaheen", | |
"kellyayotte", | |
"CoryBooker", | |
"SenatorMenendez", | |
"MartinHeinrich", | |
"SenatorTomUdall", | |
"SenSchumer", | |
"SenGillibrand", | |
"SenatorBurr", | |
"SenatorHagan", | |
"SenatorHeitkamp", | |
"SenJohnHoeven", | |
"SenSherrodBrown", | |
"robportman", | |
"jiminhofe", | |
"RonWyden", | |
"SenToomey", | |
"SenJackReed", | |
"SenWhitehouse", | |
"GrahamBlog", | |
"SenatorTimScott", | |
"SenJohnThune", | |
"SenTedCruz", | |
"SenMikeLee", | |
"SenatorLeahy", | |
"timkaine", | |
"SenRockefeller", | |
"Sen_JoeManchin", | |
"SenRonJohnson", | |
"SenatorEnzi", | |
"SenJohnBarrasso"] | |
with open('tweets.csv', 'wb') as file: | |
writer = unicodecsv.writer(file, delimiter = ',', quotechar = '"') | |
# Write header row. | |
writer.writerow(["politician_name", | |
"politician_username", | |
"politician_followers_count", | |
"politician_listed_count", | |
"politician_following", | |
"politician_favorites", | |
"politician_verified", | |
"politician_default_profile", | |
"politician_location", | |
"politician_time_zone", | |
"politician_statuses_count", | |
"politician_description", | |
"politician_geo_enabled", | |
"politician_contributors_enabled", | |
"tweet_year", | |
"tweet_month", | |
"tweet_day", | |
"tweet_hour", | |
"tweet_text", | |
"tweet_lat", | |
"tweet_long", | |
"tweet_source", | |
"tweet_in_reply_to_screen_name", | |
"tweet_direct_reply", | |
"tweet_retweet_status", | |
"tweet_retweet_count", | |
"tweet_favorite_count", | |
"tweet_hashtags", | |
"tweet_hashtags_count", | |
"tweet_urls", | |
"tweet_urls_count", | |
"tweet_user_mentions", | |
"tweet_user_mentions_count", | |
"tweet_media_type", | |
"tweet_contributors"]) | |
for user in users: | |
user_obj = api.get_user(user) | |
# Gather info specific to the current user. | |
user_info = [user_obj.name, | |
user_obj.screen_name, | |
user_obj.followers_count, | |
user_obj.listed_count, | |
user_obj.friends_count, | |
user_obj.favourites_count, | |
user_obj.verified, | |
user_obj.default_profile, | |
user_obj.location, | |
user_obj.time_zone, | |
user_obj.statuses_count, | |
user_obj.description, | |
user_obj.geo_enabled, | |
user_obj.contributors_enabled] | |
# Get 1000 most recent tweets for the current user. | |
for tweet in Cursor(api.user_timeline, screen_name = user).items(1000): | |
# Latitude and longitude stored as array of floats within a dictionary. | |
lat = tweet.coordinates['coordinates'][1] if tweet.coordinates != None else None | |
long = tweet.coordinates['coordinates'][0] if tweet.coordinates != None else None | |
# If tweet is not in reply to a screen name, it is not a direct reply. | |
direct_reply = True if tweet.in_reply_to_screen_name != "" else False | |
# Retweets start with "RT ..." | |
retweet_status = True if tweet.text[0:3] == "RT " else False | |
# Get info specific to the current tweet of the current user. | |
tweet_info = [tweet.created_at.year, | |
tweet.created_at.month, | |
tweet.created_at.day, | |
tweet.created_at.hour, | |
unidecode(tweet.text), | |
lat, | |
long, | |
tweet.source, | |
tweet.in_reply_to_screen_name, | |
direct_reply, | |
retweet_status, | |
tweet.retweet_count, | |
tweet.favorite_count] | |
# Below entities are stored as variable-length dictionaries, if present. | |
hashtags = [] | |
hashtags_data = tweet.entities.get('hashtags', None) | |
if(hashtags_data != None): | |
for i in range(len(hashtags_data)): | |
hashtags.append(unidecode(hashtags_data[i]['text'])) | |
urls = [] | |
urls_data = tweet.entities.get('urls', None) | |
if(urls_data != None): | |
for i in range(len(urls_data)): | |
urls.append(unidecode(urls_data[i]['url'])) | |
user_mentions = [] | |
user_mentions_data = tweet.entities.get('user_mentions', None) | |
if(user_mentions_data != None): | |
for i in range(len(user_mentions_data)): | |
user_mentions.append(unidecode(user_mentions_data[i]['screen_name'])) | |
media = [] | |
media_data = tweet.entities.get('media', None) | |
if(media_data != None): | |
for i in range(len(media_data)): | |
media.append(unidecode(media_data[i]['type'])) | |
contributors = [] | |
if(tweet.contributors != None): | |
for contributor in tweet.contributors: | |
contributors.append(unidecode(contributor['screen_name'])) | |
more_tweet_info = [', '.join(hashtags), | |
len(hashtags), | |
', '.join(urls), | |
len(urls), | |
', '.join(user_mentions), | |
len(user_mentions), | |
', '.join(media), | |
', '.join(contributors)] | |
# Write data to CSV. | |
writer.writerow(user_info + tweet_info + more_tweet_info) | |
# Show progress. | |
print("Wrote tweets by %s to CSV." % user) |
This is awesome, thank you so much! How would I add a try-except function to catch errors when either a user has been removed or I reached my API limit? I want to run this for hours on end without having to check on it.
I have written the code and put it through my mac terminal. A CSV file has been created but only with the headings and no tweets? Could anyone tell me why this is?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Excellent contribution. Thank you very much NicoleWhite
How do I get more than 3200 tweets in the same .csv file?
I understand that tweepy has a limit of requirements for every fifteen minutes. However, I'd like to get more tweets in the same file.
If you can't do that, how do I extract the last 3200 tweets and then in another file the remaining ones without repeating?