import tweepy
from tweepy import Cursor
import unicodecsv
from unidecode import unidecode

# Authentication and connection to the Twitter API.
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
# Usernames whose tweets we want to gather.
users = ["lisamurkowski",
         "SenJohnMcCain",
         "JeffFlake",
         "SenMarkPryor",
         "JohnBoozman",
         "SenFeinstein",
         "SenatorBoxer",
         "MarkUdall",
         "SenBennetCO",
         "ChrisMurphyCT",
         "SenBlumenthal",
         "SenatorCarper",
         "ChrisCoons",
         "marcorubio",
         "SaxbyChambliss",
         "brianschatz",
         "maziehirono",
         "MikeCrapo",
         "SenatorRisch",
         "SenatorDurbin",
         "SenDonnelly",
         "SenDanCoats",
         "ChuckGrassley",
         "SenatorHarkin",
         "SenPatRoberts",
         "JerryMoran",
         "SenRandPaul",
         "SenLandrieu",
         "DavidVitter",
         "SenatorBarb",
         "MarkeyMemo",
         "stabenow",
         "amyklobuchar",
         "SenatorWicker",
         "clairecmc",
         "RoyBlunt",
         "jontester",
         "SenatorFischer",
         "SenatorReid",
         "SenDeanHeller",
         "SenatorShaheen",
         "kellyayotte",
         "CoryBooker",
         "SenatorMenendez",
         "MartinHeinrich",
         "SenatorTomUdall",
         "SenSchumer",
         "SenGillibrand",
         "SenatorBurr",
         "SenatorHagan",
         "SenatorHeitkamp",
         "SenJohnHoeven",
         "SenSherrodBrown",
         "robportman",
         "jiminhofe",
         "RonWyden",
         "SenToomey",
         "SenJackReed",
         "SenWhitehouse",
         "GrahamBlog",
         "SenatorTimScott",
         "SenJohnThune",
         "SenTedCruz",
         "SenMikeLee",
         "SenatorLeahy",
         "timkaine",
         "SenRockefeller",
         "Sen_JoeManchin",
         "SenRonJohnson",
         "SenatorEnzi",
         "SenJohnBarrasso"]
# unicodecsv expects a binary file handle.
with open('tweets.csv', 'wb') as f:
    writer = unicodecsv.writer(f, delimiter=',', quotechar='"')

    # Write header row.
    writer.writerow(["politician_name",
                     "politician_username",
                     "politician_followers_count",
                     "politician_listed_count",
                     "politician_following",
                     "politician_favorites",
                     "politician_verified",
                     "politician_default_profile",
                     "politician_location",
                     "politician_time_zone",
                     "politician_statuses_count",
                     "politician_description",
                     "politician_geo_enabled",
                     "politician_contributors_enabled",
                     "tweet_year",
                     "tweet_month",
                     "tweet_day",
                     "tweet_hour",
                     "tweet_text",
                     "tweet_lat",
                     "tweet_long",
                     "tweet_source",
                     "tweet_in_reply_to_screen_name",
                     "tweet_direct_reply",
                     "tweet_retweet_status",
                     "tweet_retweet_count",
                     "tweet_favorite_count",
                     "tweet_hashtags",
                     "tweet_hashtags_count",
                     "tweet_urls",
                     "tweet_urls_count",
                     "tweet_user_mentions",
                     "tweet_user_mentions_count",
                     "tweet_media_type",
                     "tweet_contributors"])
    for user in users:
        user_obj = api.get_user(screen_name=user)

        # Gather info specific to the current user.
        user_info = [user_obj.name,
                     user_obj.screen_name,
                     user_obj.followers_count,
                     user_obj.listed_count,
                     user_obj.friends_count,
                     user_obj.favourites_count,
                     user_obj.verified,
                     user_obj.default_profile,
                     user_obj.location,
                     user_obj.time_zone,
                     user_obj.statuses_count,
                     user_obj.description,
                     user_obj.geo_enabled,
                     user_obj.contributors_enabled]
        # Get the 1,000 most recent tweets for the current user.
        for tweet in Cursor(api.user_timeline, screen_name=user).items(1000):
            # Latitude and longitude are stored as an array of floats within a dictionary.
            lat = tweet.coordinates['coordinates'][1] if tweet.coordinates is not None else None
            lon = tweet.coordinates['coordinates'][0] if tweet.coordinates is not None else None

            # A tweet is a direct reply only if it is in reply to a screen name;
            # the attribute is None (not an empty string) otherwise.
            direct_reply = tweet.in_reply_to_screen_name is not None

            # Retweets start with "RT ..."
            retweet_status = tweet.text.startswith("RT ")

            # Get info specific to the current tweet of the current user.
            tweet_info = [tweet.created_at.year,
                          tweet.created_at.month,
                          tweet.created_at.day,
                          tweet.created_at.hour,
                          unidecode(tweet.text),
                          lat,
                          lon,
                          tweet.source,
                          tweet.in_reply_to_screen_name,
                          direct_reply,
                          retweet_status,
                          tweet.retweet_count,
                          tweet.favorite_count]
            # The entities below are stored as variable-length lists of dictionaries, if present.
            hashtags = [unidecode(h['text']) for h in tweet.entities.get('hashtags', [])]
            urls = [unidecode(u['url']) for u in tweet.entities.get('urls', [])]
            user_mentions = [unidecode(m['screen_name']) for m in tweet.entities.get('user_mentions', [])]
            media = [unidecode(m['type']) for m in tweet.entities.get('media', [])]

            contributors = []
            if tweet.contributors is not None:
                for contributor in tweet.contributors:
                    contributors.append(unidecode(contributor['screen_name']))

            more_tweet_info = [', '.join(hashtags),
                               len(hashtags),
                               ', '.join(urls),
                               len(urls),
                               ', '.join(user_mentions),
                               len(user_mentions),
                               ', '.join(media),
                               ', '.join(contributors)]
            # Write data to CSV.
            writer.writerow(user_info + tweet_info + more_tweet_info)

        # Show progress.
        print("Wrote tweets by %s to CSV." % user)

Excellent contribution. Thank you very much, NicoleWhite.
How do I get more than 3,200 tweets into the same .csv file?
I understand that tweepy enforces a request limit every fifteen minutes. However, I'd like to get more tweets into the same file.
If that isn't possible, how do I extract the last 3,200 tweets and then write the remaining ones to another file without repeating any?
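
For what it's worth, the 3,200-tweet ceiling comes from Twitter's user_timeline endpoint itself, not from tweepy, so older tweets are not reachable this way. What you can avoid is re-downloading tweets you already have: remember the newest tweet ID per user and pass it as since_id on the next run. A minimal sketch, assuming tweepy 3.x and building on the script above; newest_ids is a hypothetical store you would persist between runs:

newest_ids = {}  # hypothetical store mapping screen_name -> newest tweet ID seen (persist between runs)

for user in users:
    cursor = Cursor(api.user_timeline,
                    screen_name=user,
                    since_id=newest_ids.get(user))  # None on the first run; skips known tweets afterwards
    for tweet in cursor.items(1000):
        newest_ids[user] = max(newest_ids.get(user, 0), tweet.id)
        # ... build and write the row exactly as in the script above ...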
This is awesome, thank you so much! How would I add a try-except block to catch errors when either a user has been removed or I have reached my API limit? I want to run this for hours on end without having to check on it.
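
One rough sketch of that: wrap the per-user work in a try/except and let tweepy sleep through rate-limit windows. This assumes tweepy 3.x, where API errors raise tweepy.TweepError (tweepy 4.x renames it to tweepy.TweepyException), and reuses the names from the script above:

# Let tweepy sleep until the rate-limit window resets instead of raising.
api = tweepy.API(auth, wait_on_rate_limit=True)

for user in users:
    try:
        user_obj = api.get_user(screen_name=user)
        for tweet in Cursor(api.user_timeline, screen_name=user).items(1000):
            pass  # ... process and write the tweet as in the script above ...
    except tweepy.TweepError as e:
        # Suspended or deleted accounts typically surface here; log and move on.
        print("Skipping %s: %s" % (user, e))
        continue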
I ran the code from my Mac terminal. A CSV file was created, but it contains only the headings and no tweets. Could anyone tell me why this is?
Excellent, thanks for this. I was wondering if you knew how to change the length of the "tweet_text" part of the output so that the entire tweet is visible in the CSV. Also, do you know of a method, similar to the one presented in the code here, that would capture tweets based not on accounts but on search terms such as hashtags?
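
Two hedged sketches that may help, again assuming tweepy 3.x. Truncation usually comes from the API returning 140-character .text; requesting tweet_mode='extended' exposes the full text as .full_text instead. For search rather than timelines, the same Cursor pattern works over the search endpoint, named api.search in tweepy 3.x (api.search_tweets in 4.x). The '#climate' query is just a placeholder:

# Full-length text: ask for extended mode and read .full_text instead of .text.
for tweet in Cursor(api.user_timeline, screen_name="SenJohnMcCain", tweet_mode='extended').items(10):
    print(unidecode(tweet.full_text))

# Search by hashtag instead of by account (tweepy 3.x name; 4.x uses api.search_tweets).
for tweet in Cursor(api.search, q='#climate', tweet_mode='extended').items(100):
    print(unidecode(tweet.full_text))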