From 80b15b994c3f073d97a64fe79c8a4fd6ce2ae016 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Tue, 12 May 2020 20:00:23 -0600 Subject: [PATCH 01/83] JM: Labs updates. --- searchtweets/__init__.py | 2 +- searchtweets/_version.py | 4 +- searchtweets/api_utils.py | 192 ++++++++++++++------------------- searchtweets/credentials.py | 82 ++++---------- searchtweets/result_stream.py | 197 ++++++++++++++++------------------ searchtweets/utils.py | 37 ++++--- 6 files changed, 217 insertions(+), 297 deletions(-) diff --git a/searchtweets/__init__.py b/searchtweets/__init__.py index d68af5a..db3cb47 100644 --- a/searchtweets/__init__.py +++ b/searchtweets/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2018 Twitter, Inc. +# Copyright 2020 Twitter, Inc. # Licensed under the MIT License # https://opensource.org/licenses/MIT from .result_stream import ResultStream, collect_results diff --git a/searchtweets/_version.py b/searchtweets/_version.py index ebbfa45..4eadb43 100644 --- a/searchtweets/_version.py +++ b/searchtweets/_version.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2018 Twitter, Inc. +# Copyright 2020 Twitter, Inc. # Licensed under the MIT License # https://opensource.org/licenses/MIT -VERSION = "1.7.4" +VERSION = "1.0.0" diff --git a/searchtweets/api_utils.py b/searchtweets/api_utils.py index c61392a..b452b25 100644 --- a/searchtweets/api_utils.py +++ b/searchtweets/api_utils.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2018 Twitter, Inc. +# Copyright 2020 Twitter, Inc. # Licensed under the MIT License # https://opensource.org/licenses/MIT """ @@ -9,27 +9,32 @@ import re import datetime +from dateutil.relativedelta import * import logging try: import ujson as json except ImportError: import json -__all__ = ["gen_rule_payload", "gen_params_from_config", - "infer_endpoint", "convert_utc_time", - "validate_count_api", "change_to_count_endpoint"] +__all__ = ["gen_request_parameters", + "gen_params_from_config", + "convert_utc_time"] logger = logging.getLogger(__name__) def convert_utc_time(datetime_str): """ - Handles datetime argument conversion to the GNIP API format, which is - `YYYYMMDDHHSS`. Flexible passing of date formats in the following types:: + Handles datetime argument conversion to the Labs API format, which is + `YYYY-MM-DDTHH:mm:ssZ`. + Flexible passing of date formats in the following types:: - YYYYmmDDHHMM - YYYY-mm-DD - YYYY-mm-DD HH:MM - YYYY-mm-DDTHH:MM + #Coming soon: + - 3d + -12h Args: datetime_str (str): valid formats are listed above. @@ -48,92 +53,78 @@ def convert_utc_time(datetime_str): >>> convert_utc_time("2017-08-02T00:00") '201708020000' """ + if not datetime_str: return None - if not set(['-', ':']) & set(datetime_str): - _date = datetime.datetime.strptime(datetime_str, "%Y%m%d%H%M") - else: - try: - if "T" in datetime_str: - # command line with 'T' - datetime_str = datetime_str.replace('T', ' ') + try: + if len(datetime_str) <= 5: + _date = datetime.datetime.utcnow() + #parse out numeric character. 
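+            # e.g. '2d' -> num = 2.0; the unit suffix checked below decides
+            # whether that count is subtracted from utcnow() as days,
+            # hours, or minutes.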
+ num = float(datetime_str[:-1]) + if 'd' in datetime_str: + _date = (_date + relativedelta(days=-num)) + elif 'h' in datetime_str: + _date = (_date + relativedelta(hours=-num)) + elif 'm' in datetime_str: + _date = (_date + relativedelta(minutes=-num)) + elif not set(['-', ':']) & set(datetime_str): + _date = datetime.datetime.strptime(datetime_str, "%Y%m%d%H%M") + elif 'T' in datetime_str: + # command line with 'T' + datetime_str = datetime_str.replace('T', ' ') _date = datetime.datetime.strptime(datetime_str, "%Y-%m-%d %H:%M") - except ValueError: - _date = datetime.datetime.strptime(datetime_str, "%Y-%m-%d") - return _date.strftime("%Y%m%d%H%M") - - -def change_to_count_endpoint(endpoint): - """Utility function to change a normal endpoint to a ``count`` api - endpoint. Returns the same endpoint if it's already a valid count endpoint. - Args: - endpoint (str): your api endpoint - - Returns: - str: the modified endpoint for a count endpoint. - """ - - tokens = filter(lambda x: x != '', re.split("[/:]", endpoint)) - filt_tokens = list(filter(lambda x: x != "https", tokens)) - last = filt_tokens[-1].split('.')[0] # removes .json on the endpoint - filt_tokens[-1] = last # changes from *.json -> '' for changing input - if last == 'counts': - return endpoint - else: - return "https://" + '/'.join(filt_tokens) + '/' + "counts.json" + except ValueError: + _date = datetime.datetime.strptime(datetime_str, "%Y-%m-%d") + return _date.strftime("%Y-%m-%dT%H:%M:%SZ") -def gen_rule_payload(pt_rule, results_per_call=None, - from_date=None, to_date=None, count_bucket=None, - tag=None, - stringify=True): +def gen_request_parameters(query, results_per_call=None, + start_time=None, end_time=None, since_id=None, until_id=None, + tweet_fields=None, + stringify=True): """ - Generates the dict or json payload for a PowerTrack rule. + Generates the dict or json payload for a search query. Args: - pt_rule (str): The string version of a powertrack rule, - e.g., "beyonce has:geo". Accepts multi-line strings + query (str): The string version of a search query, + e.g., "snow has:media -is:retweet". Accepts multi-line strings for ease of entry. results_per_call (int): number of tweets or counts returned per API - call. This maps to the ``maxResults`` search API parameter. - Defaults to 500 to reduce API call usage. - from_date (str or None): Date format as specified by + call. This maps to the `max_results`` search API parameter. + Defaults to 100 (maximum supported in Labs). + start_time (str or None): Date format as specified by `convert_utc_time` for the starting time of your search. - to_date (str or None): date format as specified by `convert_utc_time` + end_time (str or None): date format as specified by `convert_utc_time` for the end time of your search. - count_bucket (str or None): If using the counts api endpoint, - will define the count bucket for which tweets are aggregated. + tweet_fields (string): comma-delimted list of Tweet JSON attributes wanted in endpoint responses. Default is "id,created_at,text"). stringify (bool): specifies the return type, `dict` or json-formatted `str`. Example: - >>> from searchtweets.utils import gen_rule_payload - >>> gen_rule_payload("beyonce has:geo", - ... from_date="2017-08-21", - ... to_date="2017-08-22") - '{"query":"beyonce has:geo","maxResults":100,"toDate":"201708220000","fromDate":"201708210000"}' + >>> from searchtweets.utils import gen_request_parameters + >>> gen_request_parameters("snow has:media -is:retweet", + ... 
from_date="2020-02-18", + ... to_date="2020-02-21") + '{"query":"snow has:media -is:retweet","max_results":100,"start_time":"202002180000","end_time":"202002210000"}' """ - pt_rule = ' '.join(pt_rule.split()) # allows multi-line strings - payload = {"query": pt_rule} + #Set endpoint request parameter to command-line arguments. This is where 'translation' happens. + query = ' '.join(query.split()) # allows multi-line strings + payload = {"query": query} if results_per_call is not None and isinstance(results_per_call, int) is True: - payload["maxResults"] = results_per_call - if to_date: - payload["toDate"] = convert_utc_time(to_date) - if from_date: - payload["fromDate"] = convert_utc_time(from_date) - if count_bucket: - if set(["day", "hour", "minute"]) & set([count_bucket]): - payload["bucket"] = count_bucket - del payload["maxResults"] - else: - logger.error("invalid count bucket: provided {}" - .format(count_bucket)) - raise ValueError - if tag: - payload["tag"] = tag + payload["max_results"] = results_per_call + if start_time: + payload["start_time"] = convert_utc_time(start_time) + if end_time: + payload["end_time"] = convert_utc_time(end_time) + if since_id: + payload["since_id"] = since_id + if until_id: + payload["until_id"] = until_id + if tweet_fields: + payload["tweet.fields"] = tweet_fields return json.dumps(payload) if stringify else payload @@ -143,13 +134,13 @@ def gen_params_from_config(config_dict): Generates parameters for a ResultStream from a dictionary. """ - if config_dict.get("count_bucket"): - logger.warning("change your endpoint to the count endpoint; this is " - "default behavior when the count bucket " - "field is defined") - endpoint = change_to_count_endpoint(config_dict.get("endpoint")) - else: - endpoint = config_dict.get("endpoint") + # if config_dict.get("count_bucket"): + # logger.warning("change your endpoint to the count endpoint; this is " + # "default behavior when the count bucket " + # "field is defined") + # endpoint = change_to_count_endpoint(config_dict.get("endpoint")) + # else: + endpoint = config_dict.get("endpoint") def intify(arg): @@ -158,50 +149,29 @@ def intify(arg): else: return arg - # this parameter comes in as a string when it's parsed + # This numeric parameter comes in as a string when it's parsed results_per_call = intify(config_dict.get("results_per_call", None)) - rule = gen_rule_payload(pt_rule=config_dict["pt_rule"], - from_date=config_dict.get("from_date", None), - to_date=config_dict.get("to_date", None), - results_per_call=results_per_call, - count_bucket=config_dict.get("count_bucket", None)) + query = gen_request_parameters(query=config_dict["query"], + start_time=config_dict.get("start_time", None), + end_time=config_dict.get("end_time", None), + since_id=config_dict.get("since_id", None), + until_id=config_dict.get("until_id", None), + tweet_fields=config_dict.get("tweet_fields", None), + results_per_call=results_per_call) + #count_bucket=config_dict.get("count_bucket", None)) _dict = {"endpoint": endpoint, - "username": config_dict.get("username"), - "password": config_dict.get("password"), "bearer_token": config_dict.get("bearer_token"), "extra_headers_dict": config_dict.get("extra_headers_dict",None), - "rule_payload": rule, + "request_parameters": query, "results_per_file": intify(config_dict.get("results_per_file")), - "max_results": intify(config_dict.get("max_results")), + "max_tweets": intify(config_dict.get("max_tweets")), "max_pages": intify(config_dict.get("max_pages", None))} + return _dict -def 
infer_endpoint(rule_payload): - """ - Infer which endpoint should be used for a given rule payload. - """ - bucket = (rule_payload if isinstance(rule_payload, dict) - else json.loads(rule_payload)).get("bucket") - return "counts" if bucket else "search" -def validate_count_api(rule_payload, endpoint): - """ - Ensures that the counts api is set correctly in a payload. - """ - rule = (rule_payload if isinstance(rule_payload, dict) - else json.loads(rule_payload)) - bucket = rule.get('bucket') - counts = set(endpoint.split("/")) & {"counts.json"} - if len(counts) == 0: - if bucket is not None: - msg = ("""There is a count bucket present in your payload, - but you are using not using the counts API. - Please check your endpoints and try again""") - logger.error(msg) - raise ValueError - diff --git a/searchtweets/credentials.py b/searchtweets/credentials.py index 081c5db..309544c 100644 --- a/searchtweets/credentials.py +++ b/searchtweets/credentials.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2017 Twitter, Inc. +# Copyright 2020 Twitter, Inc. # Licensed under the Apache License, Version 2.0 # http://www.apache.org/licenses/LICENSE-2.0 """This module handles credential management and parsing for the API. As we @@ -45,11 +45,7 @@ def _load_yaml_credentials(filename=None, yaml_key=None): def _load_env_credentials(): vars_ = ["SEARCHTWEETS_ENDPOINT", - "SEARCHTWEETS_ACCOUNT", - "SEARCHTWEETS_USERNAME", - "SEARCHTWEETS_PASSWORD", "SEARCHTWEETS_BEARER_TOKEN", - "SEARCHTWEETS_ACCOUNT_TYPE", "SEARCHTWEETS_CONSUMER_KEY", "SEARCHTWEETS_CONSUMER_SECRET" ] @@ -60,44 +56,22 @@ def _load_env_credentials(): return parsed -def _parse_credentials(search_creds, account_type): - - if account_type is None: - account_type = search_creds.get("account_type", None) - # attempt to infer account type - if account_type is None: - if search_creds.get("bearer_token") is not None: - account_type = "premium" - elif search_creds.get("password") is not None: - account_type = "enterprise" - else: - pass - - if account_type not in {"premium", "enterprise"}: - msg = """Account type is not specified and cannot be inferred. - Please check your credential file, arguments, or environment variables - for issues. The account type must be 'premium' or 'enterprise'. - """ - logger.error(msg) - raise KeyError +def _parse_credentials(search_creds, api_version=None): try: - if account_type == "premium": - if "bearer_token" not in search_creds: - if "consumer_key" in search_creds \ - and "consumer_secret" in search_creds: - search_creds["bearer_token"] = _generate_bearer_token( - search_creds["consumer_key"], - search_creds["consumer_secret"]) - - search_args = { - "bearer_token": search_creds["bearer_token"], - "endpoint": search_creds["endpoint"], - "extra_headers_dict": search_creds.get("extra_headers",None)} - if account_type == "enterprise": - search_args = {"username": search_creds["username"], - "password": search_creds["password"], - "endpoint": search_creds["endpoint"]} + + if "bearer_token" not in search_creds: + if "consumer_key" in search_creds \ + and "consumer_secret" in search_creds: + search_creds["bearer_token"] = _generate_bearer_token( + search_creds["consumer_key"], + search_creds["consumer_secret"]) + + search_args = { + "bearer_token": search_creds["bearer_token"], + "endpoint": search_creds["endpoint"], + "extra_headers_dict": search_creds.get("extra_headers",None)} + except KeyError: logger.error("Your credentials are not configured correctly and " " you are missing a required field. 
Please see the " @@ -106,8 +80,7 @@ def _parse_credentials(search_creds, account_type): return search_args - -def load_credentials(filename=None, account_type=None, +def load_credentials(filename=None, yaml_key=None, env_overwrite=True): """ Handles credential management. Supports both YAML files and environment @@ -118,12 +91,9 @@ def load_credentials(filename=None, account_type=None, : endpoint: - username: - password: consumer_key: consumer_secret: bearer_token: - account_type: extra_headers: : @@ -136,10 +106,8 @@ def load_credentials(filename=None, account_type=None, .. code: yaml SEARCHTWEETS_ENDPOINT - SEARCHTWEETS_USERNAME - SEARCHTWEETS_PASSWORD SEARCHTWEETS_BEARER_TOKEN - SEARCHTWEETS_ACCOUNT_TYPE + SEARCHTWEETS_API_VERSION ... Again, set the variables that correspond to your account information and @@ -149,8 +117,8 @@ def load_credentials(filename=None, account_type=None, Args: filename (str): pass a filename here if you do not want to use the default ``~/.twitter_keys.yaml`` - account_type (str): your account type, "premium" or "enterprise". We - will attempt to infer the account info if left empty. + api_version (str): API version, "labs_v1" or "labs_v2". We + will attempt to infer the version info if left empty. yaml_key (str): the top-level key in the YAML file that has your information. Defaults to ``search_tweets_api``. env_overwrite: any found environment variables will overwrite values @@ -161,18 +129,13 @@ def load_credentials(filename=None, account_type=None, Example: >>> from searchtweets.api_utils import load_credentials - >>> search_args = load_credentials(account_type="premium", - env_overwrite=False) + >>> search_args = load_credentials(env_overwrite=False) >>> search_args.keys() dict_keys(['bearer_token', 'endpoint']) >>> import os >>> os.environ["SEARCHTWEETS_ENDPOINT"] = "https://endpoint" - >>> os.environ["SEARCHTWEETS_USERNAME"] = "areallybadpassword" - >>> os.environ["SEARCHTWEETS_PASSWORD"] = "" >>> load_credentials() - {'endpoint': 'https://endpoint', - 'password': '', - 'username': 'areallybadpassword'} + {'endpoint': 'https://endpoint'} """ yaml_key = yaml_key if yaml_key is not None else "search_tweets_api" @@ -186,7 +149,7 @@ def load_credentials(filename=None, account_type=None, merged_vars = (merge_dicts(yaml_vars, env_vars) if env_overwrite else merge_dicts(env_vars, yaml_vars)) - parsed_vars = _parse_credentials(merged_vars, account_type=account_type) + parsed_vars = _parse_credentials(merged_vars) return parsed_vars @@ -204,3 +167,4 @@ def _generate_bearer_token(consumer_key, consumer_secret): resp.raise_for_status() return resp.json()['access_token'] + diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index dcc995c..8d4eccf 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -1,18 +1,17 @@ # -*- coding: utf-8 -*- -# Copyright 2018 Twitter, Inc. +# Copyright 2020 Twitter, Inc. # Licensed under the MIT License # https://opensource.org/licenses/MIT """ This module contains the request handing and actual API wrapping functionality. - Its core method is the ``ResultStream`` object, which takes the API call arguments and returns a stream of results to the user. 
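
With the Labs endpoints, requests are now issued as HTTP GETs, the search
parameters being URL-encoded onto the endpoint URL, e.g. (hypothetical
values)::

    {"query": "snow has:media", "max_results": 100}
    --> GET <endpoint>?query=snow+has%3Amedia&max_results=100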
""" import time -import re import logging import requests +from urllib.parse import urlencode try: import ujson as json except ImportError: @@ -21,25 +20,19 @@ from .utils import merge_dicts -from .api_utils import infer_endpoint, change_to_count_endpoint - from ._version import VERSION logger = logging.getLogger(__name__) -def make_session(username=None, password=None, bearer_token=None, extra_headers_dict=None): +def make_session(bearer_token=None, extra_headers_dict=None): """Creates a Requests Session for use. Accepts a bearer token - for premiums users and will override username and password information if - present. - + for Labs. Args: - username (str): username for the session - password (str): password for the user - bearer_token (str): token for a premium API user. + bearer_token (str): token for a Labs user. """ - if password is None and bearer_token is None: + if bearer_token is None: logger.error("No authentication information provided; " "please check your object") raise KeyError @@ -47,35 +40,33 @@ def make_session(username=None, password=None, bearer_token=None, extra_headers_ session = requests.Session() session.trust_env = False headers = {'Accept-encoding': 'gzip', - 'User-Agent': 'twitterdev-search-tweets-python/' + VERSION} + 'User-Agent': 'twitterdev-search-tweets-python-labs/' + VERSION} + if bearer_token: logger.info("using bearer token for authentication") headers['Authorization'] = "Bearer {}".format(bearer_token) session.headers = headers - else: - logger.info("using username and password for authentication") - session.auth = username, password - session.headers = headers + if extra_headers_dict: - headers.update(extra_headers_dict) + headers.update(extra_headers_dict) return session - def retry(func): """ - Decorator to handle API retries and exceptions. Defaults to three retries. - + Decorator to handle API retries and exceptions. Defaults to five retries. + Rate-limit (429) and server-side errors (5XX) implement a retry design. + Other 4XX errors are a 'one and done' type error. + Retries implement an exponential backoff... Args: func (function): function for decoration - Returns: decorated function - """ def retried_func(*args, **kwargs): max_tries = 10 tries = 0 total_sleep_seconds = 0 + while True: try: resp = func(*args, **kwargs) @@ -92,26 +83,25 @@ def retried_func(*args, **kwargs): tries += 1 - logger.error(f"HTTP Error code: {resp.status_code}: {resp.text}") - logger.error(f"Request payload: {kwargs['rule_payload']}") + logger.error(f" HTTP Error code: {resp.status_code}: {resp.text} | {resp.reason}") + logger.error(f" Request payload: {kwargs['request_parameters']}") if resp.status_code == 429: - logger.warning("Rate limit hit... Will retry...") - #print("Rate limit hit... Will retry...") - sleep_seconds = min(((tries * 2) ** 2), 900 - total_sleep_seconds) + logger.error("Rate limit hit... Will retry...") + #Expontential backoff, but within a 15-minute (900 seconds) period. No sense in backing off for more than 15 minutes. + sleep_seconds = min(((tries * 2) ** 2), max(900 - total_sleep_seconds,30)) total_sleep_seconds = total_sleep_seconds + sleep_seconds elif resp.status_code >= 500: - logger.warning("Server-side error... Will retry...") - #print("Server-side error... Will retry...") + logger.error("Server-side error... Will retry...") sleep_seconds = 30 else: #Other errors are a "one and done", no use in retrying error... + logger.error('Quitting... ') raise requests.exceptions.HTTPError - # mini exponential backoff here. 
- logger.warning(f"Will retry in {sleep_seconds} seconds...") - #print(f"Will retry in {sleep_seconds} seconds...") + + logger.error(f"Will retry in {sleep_seconds} seconds...") time.sleep(sleep_seconds) continue @@ -123,20 +113,30 @@ def retried_func(*args, **kwargs): @retry -def request(session, url, rule_payload, **kwargs): +def request(session, url, request_parameters, **kwargs): """ Executes a request with the given payload and arguments. - Args: session (requests.Session): the valid session object url (str): Valid API endpoint - rule_payload (str or dict): rule package for the POST. If you pass a + request_parameters (str or dict): rule package for the POST. If you pass a dictionary, it will be converted into JSON. """ - if isinstance(rule_payload, dict): - rule_payload = json.dumps(rule_payload) + + if isinstance(request_parameters, dict): + request_parameters = json.dumps(request_parameters) logger.debug("sending request") - result = session.post(url, data=rule_payload, **kwargs) + + request_json = json.loads(request_parameters) + + #Using POST command, not yet supported in Labs. + #result = session.post(url, data=request_parameters, **kwargs) + + #New Labs-specific code in support of GET requests. + request_url = urlencode(request_json) + url = f"{url}?{request_url}" + + result = session.get(url, **kwargs) return result @@ -145,49 +145,46 @@ class ResultStream: Class to represent an API query that handles two major functionality pieces: wrapping metadata around a specific API call and automatic pagination of results. - Args: - username (str): username for enterprise customers - password (str): password for enterprise customers - bearer_token (str): bearer token for premium users - endpoint (str): API endpoint; see your console at developer.twitter.com - rule_payload (json or dict): payload for the post request - max_results (int): max number results that will be returned from this + bearer_token (str): bearer token for Labs. + + endpoint (str): API endpoint. + + request_parameters (json or dict): payload for the post request + + max_tweets (int): max number results that will be returned from this instance. Note that this can be slightly lower than the total returned - from the API call - e.g., setting ``max_results = 10`` would return - ten results, but an API call will return at minimum 100 results. + from the API call - e.g., setting ``max_tweets = 10`` would return + ten results, but an API call will return at minimum 100 results by default. + tweetify (bool): If you are grabbing tweets and not counts, use the tweet parser library to convert each raw tweet package to a Tweet with lazy properties. - max_requests (int): A hard cutoff for the number of API calls this - instance will make. Good for testing in sandbox premium environments. - extra_headers_dict (dict): custom headers to add + max_requests (int): A hard cutoff for the number of API calls this + instance will make. Good for testing in Labs environment. + extra_headers_dict (dict): custom headers to add Example: - >>> rs = ResultStream(**search_args, rule_payload=rule, max_pages=1) + >>> rs = ResultStream(**search_args, request_parameters=rule, max_pages=1) >>> results = list(rs.stream()) - """ # leaving this here to have an API call counter for ALL objects in your # session, helping with usage of the convenience functions in the library. 
session_request_counter = 0 - def __init__(self, endpoint, rule_payload, username=None, password=None, - bearer_token=None, extra_headers_dict=None, max_results=500, + def __init__(self, endpoint, request_parameters, bearer_token=None, extra_headers_dict=None, max_tweets=500, tweetify=True, max_requests=None, **kwargs): - self.username = username - self.password = password self.bearer_token = bearer_token self.extra_headers_dict = extra_headers_dict - if isinstance(rule_payload, str): - rule_payload = json.loads(rule_payload) - self.rule_payload = rule_payload + if isinstance(request_parameters, str): + request_parameters = json.loads(request_parameters) + self.request_parameters = request_parameters self.tweetify = tweetify # magic number of max tweets if you pass a non_int - self.max_results = (max_results if isinstance(max_results, int) - else 10 ** 15) + self.max_tweets = (max_tweets if isinstance(max_tweets, int) + else 10 ** 15) self.total_results = 0 self.n_requests = 0 @@ -199,17 +196,13 @@ def __init__(self, endpoint, rule_payload, username=None, password=None, # magic number of requests! self.max_requests = (max_requests if max_requests is not None else 10 ** 9) - self.endpoint = (change_to_count_endpoint(endpoint) - if infer_endpoint(rule_payload) == "counts" - else endpoint) - # validate_count_api(self.rule_payload, self.endpoint) + self.endpoint = endpoint def stream(self): """ Main entry point for the data from the API. Will automatically paginate - through the results via the ``next`` token and return up to ``max_results`` + through the results via the ``next`` token and return up to ``max_tweets`` tweets or up to ``max_requests`` API calls, whichever is lower. - Usage: >>> result_stream = ResultStream(**kwargs) >>> stream = result_stream.stream() @@ -218,24 +211,27 @@ def stream(self): >>> results = list(ResultStream(**kwargs).stream()) """ self.init_session() - self.check_counts() self.execute_request() self.stream_started = True + while True: + if self.current_tweets == None: + break for tweet in self.current_tweets: - if self.total_results >= self.max_results: + if self.total_results >= self.max_tweets: break yield self._tweet_func(tweet) self.total_results += 1 - if self.next_token and self.total_results < self.max_results and self.n_requests <= self.max_requests: - self.rule_payload = merge_dicts(self.rule_payload, - {"next": self.next_token}) + if self.next_token and self.total_results < self.max_tweets and self.n_requests <= self.max_requests: + self.request_parameters = merge_dicts(self.request_parameters, + {"next_token": self.next_token}) logger.info("paging; total requests read so far: {}" .format(self.n_requests)) self.execute_request() else: break + logger.info("ending stream at {} tweets".format(self.total_results)) self.current_tweets = None self.session.close() @@ -246,19 +242,9 @@ def init_session(self): """ if self.session: self.session.close() - self.session = make_session(self.username, - self.password, - self.bearer_token, + self.session = make_session(self.bearer_token, self.extra_headers_dict) - def check_counts(self): - """ - Disables tweet parsing if the count API is used. - """ - if "counts" in re.split("[/.]", self.endpoint): - logger.info("disabling tweet parsing due to counts API usage") - self._tweet_func = lambda x: x - def execute_request(self): """ Sends the request to the API and parses the json response. 
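        Labs responses return Tweets in a 'data' array, with pagination
        details (including any 'next_token') in a 'meta' object.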
@@ -271,52 +257,53 @@ def execute_request(self): resp = request(session=self.session, url=self.endpoint, - rule_payload=self.rule_payload) + request_parameters=self.request_parameters) self.n_requests += 1 ResultStream.session_request_counter += 1 - resp = json.loads(resp.content.decode(resp.encoding)) - self.next_token = resp.get("next", None) - self.current_tweets = resp["results"] + try: + resp = json.loads(resp.content.decode(resp.encoding)) + + meta = resp.get("meta", None) + self.next_token = meta.get("next_token", None) + self.current_tweets = resp.get("data", None) + + except: + print("Error parsing content as JSON.") def __repr__(self): - repr_keys = ["username", "endpoint", "rule_payload", - "tweetify", "max_results"] + repr_keys = ["endpoint", "request_parameters", + "tweetify", "max_tweets"] str_ = json.dumps(dict([(k, self.__dict__.get(k)) for k in repr_keys]), indent=4) str_ = "ResultStream: \n\t" + str_ return str_ - -def collect_results(rule, max_results=500, result_stream_args=None): +def collect_results(query, max_tweets=1000, result_stream_args=None): """ Utility function to quickly get a list of tweets from a ``ResultStream`` without keeping the object around. Requires your args to be configured prior to using. - Args: - rule (str): valid powertrack rule for your account, preferably - generated by the `gen_rule_payload` function. - max_results (int): maximum number of tweets or counts to return from + query (str): valid powertrack rule for your account, preferably + generated by the `gen_request_parameters` function. + max_tweets (int): maximum number of tweets or counts to return from the API / underlying ``ResultStream`` object. result_stream_args (dict): configuration dict that has connection information for a ``ResultStream`` object. - Returns: list of results - Example: >>> from searchtweets import collect_results - >>> tweets = collect_results(rule, - max_results=500, + >>> tweets = collect_results(query, + max_tweets=500, result_stream_args=search_args) - """ if result_stream_args is None: logger.error("This function requires a configuration dict for the " "inner ResultStream object.") raise KeyError - rs = ResultStream(rule_payload=rule, - max_results=max_results, + rs = ResultStream(request_parameters=query, + max_tweets=max_tweets, **result_stream_args) - return list(rs.stream()) + return list(rs.stream()) \ No newline at end of file diff --git a/searchtweets/utils.py b/searchtweets/utils.py index 2efd664..697174d 100644 --- a/searchtweets/utils.py +++ b/searchtweets/utils.py @@ -1,7 +1,7 @@ """ Utility functions that are used in various parts of the program. """ -# Copyright 2018 Twitter, Inc. +# Copyright 2020 Twitter, Inc. 
# Licensed under the MIT License # https://opensource.org/licenses/MIT @@ -71,10 +71,10 @@ def merge_dicts(*dicts): Example: >>> from searchtweets.utils import merge_dicts - >>> d1 = {"rule": "something has:geo"} - >>> d2 = {"maxResults": 1000} + >>> d1 = {"query": "snow has:media -is:retweet"} + >>> d2 = {"max_tweets": 1000} >>> merge_dicts(*[d1, d2]) - {"maxResults": 1000, "rule": "something has:geo"} + {"max_results": 1000, "rule": "something has:geo"} """ def _merge_dicts(dict1, dict2): merged = dict1.copy() @@ -148,32 +148,31 @@ def read_config(filename): search_rules: from-date: 2017-06-01 to-date: 2017-09-01 01:01 - pt-rule: kanye + query: snow search_params: - results-per-call: 500 - max-results: 500 + results-per-call: 100 + max-tweets: 500 output_params: save_file: True - filename_prefix: kanye + filename_prefix: snow results_per_file: 10000000 or:: - [search_rules] from_date = 2017-06-01 to_date = 2017-09-01 - pt_rule = beyonce has:geo + query = snow has:geo [search_params] - results_per_call = 500 - max_results = 500 + results_per_call = 100 + max_tweets = 500 [output_params] save_file = True - filename_prefix = beyonce + filename_prefix = snow_geo results_per_file = 10000000 Args: @@ -203,10 +202,10 @@ def read_config(filename): # ensure args are renamed correctly: config_dict = {k.replace('-', '_'): v for k, v in config_dict.items()} - # YAML will parse datestrings as datetimes; we'll convert them here if they - # exist - if config_dict.get("to_date") is not None: - config_dict["to_date"] = str(config_dict["to_date"]) - if config_dict.get("from_date") is not None: - config_dict["from_date"] = str(config_dict["from_date"]) + # YAML will parse datestrings as datetimes; we'll convert them here if they exist + + if config_dict.get("start_time") is not None: + config_dict["start_time"] = str(config_dict["start_time"]) + if config_dict.get("end_time") is not None: + config_dict["end_time"] = str(config_dict["end_time"]) return config_dict From bfe79d053cc56e5b991d41b24395ef35dc5a22aa Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Tue, 12 May 2020 20:09:29 -0600 Subject: [PATCH 02/83] JM: moving things around. 
--- tools/api_config_example.config | 13 -- tools/api_yaml_example.yaml | 13 -- tools/search_tweets.py | 207 -------------------------------- 3 files changed, 233 deletions(-) delete mode 100644 tools/api_config_example.config delete mode 100644 tools/api_yaml_example.yaml delete mode 100644 tools/search_tweets.py diff --git a/tools/api_config_example.config b/tools/api_config_example.config deleted file mode 100644 index 230d731..0000000 --- a/tools/api_config_example.config +++ /dev/null @@ -1,13 +0,0 @@ -[search_rules] -from_date = 2017-06-01 -to_date = 2017-09-01 -pt_rule = beyonce has:geo - -[search_params] -results_per_call = 500 -max_results = 500 - -[output_params] -save_file = True -filename_prefix = beyonce -results_per_file = 10000000 diff --git a/tools/api_yaml_example.yaml b/tools/api_yaml_example.yaml deleted file mode 100644 index d1bf9e6..0000000 --- a/tools/api_yaml_example.yaml +++ /dev/null @@ -1,13 +0,0 @@ -search_rules: - from-date: 2017-06-01 - to-date: 2017-09-01 01:01 - pt-rule: kanye - -search_params: - results-per-call: 500 - max-results: 500 - -output_params: - save_file: True - filename_prefix: kanye - results_per_file: 10000000 diff --git a/tools/search_tweets.py b/tools/search_tweets.py deleted file mode 100644 index c2b699e..0000000 --- a/tools/search_tweets.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env python -# Copyright 2017 Twitter, Inc. -# Licensed under the Apache License, Version 2.0 -# http://www.apache.org/licenses/LICENSE-2.0 -import os -import argparse -import json -import sys -import logging -from searchtweets import (ResultStream, - load_credentials, - merge_dicts, - read_config, - write_result_stream, - gen_params_from_config) - -logger = logging.getLogger() -# we want to leave this here and have it command-line configurable via the -# --debug flag -logging.basicConfig(level=os.environ.get("LOGLEVEL", "ERROR")) - - -REQUIRED_KEYS = {"pt_rule", "endpoint"} - - -def parse_cmd_args(): - argparser = argparse.ArgumentParser() - help_msg = """configuration file with all parameters. Far, - easier to use than the command-line args version., - If a valid file is found, all args will be populated, - from there. Remaining command-line args, - will overrule args found in the config, - file.""" - - argparser.add_argument("--credential-file", - dest="credential_file", - default=None, - help=("Location of the yaml file used to hold " - "your credentials.")) - - argparser.add_argument("--credential-file-key", - dest="credential_yaml_key", - default=None, - help=("the key in the credential file used " - "for this session's credentials. " - "Defaults to search_tweets_api")) - - argparser.add_argument("--env-overwrite", - dest="env_overwrite", - default=True, - help=("""Overwrite YAML-parsed credentials with - any set environment variables. See API docs or - readme for details.""")) - - argparser.add_argument("--config-file", - dest="config_filename", - default=None, - help=help_msg) - - argparser.add_argument("--account-type", - dest="account_type", - default=None, - choices=["premium", "enterprise"], - help="The account type you are using") - - argparser.add_argument("--count-bucket", - dest="count_bucket", - default=None, - help=("""Bucket size for counts API. 
Options:, - day, hour, minute (default is 'day').""")) - - argparser.add_argument("--start-datetime", - dest="from_date", - default=None, - help="""Start of datetime window, format - 'YYYY-mm-DDTHH:MM' (default: -30 days)""") - - argparser.add_argument("--end-datetime", - dest="to_date", - default=None, - help="""End of datetime window, format - 'YYYY-mm-DDTHH:MM' (default: most recent - date)""") - - argparser.add_argument("--filter-rule", - dest="pt_rule", - default=None, - help="PowerTrack filter rule (See: http://support.gnip.com/customer/portal/articles/901152-powertrack-operators)") - - argparser.add_argument("--results-per-call", - dest="results_per_call", - help="Number of results to return per call " - "(default 100; max 500) - corresponds to " - "'maxResults' in the API") - - argparser.add_argument("--max-results", dest="max_results", - type=int, - help="Maximum number of Tweets or Counts to return for this session") - - argparser.add_argument("--max-pages", - dest="max_pages", - type=int, - default=None, - help="Maximum number of pages/API calls to " - "use for this session.") - - argparser.add_argument("--results-per-file", dest="results_per_file", - default=None, - type=int, - help="Maximum tweets to save per file.") - - argparser.add_argument("--filename-prefix", - dest="filename_prefix", - default=None, - help="prefix for the filename where tweet " - " json data will be stored.") - - argparser.add_argument("--no-print-stream", - dest="print_stream", - action="store_false", - help="disable print streaming") - - argparser.add_argument("--print-stream", - dest="print_stream", - action="store_true", - default=True, - help="Print tweet stream to stdout") - - argparser.add_argument("--extra-headers", - dest="extra_headers", - type=str, - default=None, - help="JSON-formatted str representing a dict of additional request headers") - - argparser.add_argument("--debug", - dest="debug", - action="store_true", - default=False, - help="print all info and warning messages") - return argparser - - -def _filter_sensitive_args(dict_): - sens_args = ("password", "consumer_key", "consumer_secret", "bearer_token") - return {k: v for k, v in dict_.items() if k not in sens_args} - -def main(): - args_dict = vars(parse_cmd_args().parse_args()) - if args_dict.get("debug") is True: - logger.setLevel(logging.DEBUG) - logger.debug("command line args dict:") - logger.debug(json.dumps(args_dict, indent=4)) - - if args_dict.get("config_filename") is not None: - configfile_dict = read_config(args_dict["config_filename"]) - else: - configfile_dict = {} - - extra_headers_str = args_dict.get("extra_headers") - if extra_headers_str is not None: - args_dict['extra_headers_dict'] = json.loads(extra_headers_str) - del args_dict['extra_headers'] - - logger.debug("config file ({}) arguments sans sensitive args:".format(args_dict["config_filename"])) - logger.debug(json.dumps(_filter_sensitive_args(configfile_dict), indent=4)) - - creds_dict = load_credentials(filename=args_dict["credential_file"], - account_type=args_dict["account_type"], - yaml_key=args_dict["credential_yaml_key"], - env_overwrite=args_dict["env_overwrite"]) - - dict_filter = lambda x: {k: v for k, v in x.items() if v is not None} - - config_dict = merge_dicts(dict_filter(configfile_dict), - dict_filter(creds_dict), - dict_filter(args_dict)) - - logger.debug("combined dict (cli, config, creds) sans password:") - logger.debug(json.dumps(_filter_sensitive_args(config_dict), indent=4)) - - if len(dict_filter(config_dict).keys() 
& REQUIRED_KEYS) < len(REQUIRED_KEYS): - print(REQUIRED_KEYS - dict_filter(config_dict).keys()) - logger.error("ERROR: not enough arguments for the program to work") - sys.exit(1) - - stream_params = gen_params_from_config(config_dict) - logger.debug("full arguments passed to the ResultStream object sans password") - logger.debug(json.dumps(_filter_sensitive_args(stream_params), indent=4)) - - rs = ResultStream(tweetify=False, **stream_params) - - logger.debug(str(rs)) - - if config_dict.get("filename_prefix") is not None: - stream = write_result_stream(rs, - filename_prefix=config_dict.get("filename_prefix"), - results_per_file=config_dict.get("results_per_file")) - else: - stream = rs.stream() - - for tweet in stream: - if config_dict["print_stream"] is True: - print(json.dumps(tweet)) - - -if __name__ == '__main__': - main() From 94227bafc36b4969b9d36cee8cc2bd63b470bb26 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Tue, 12 May 2020 20:13:16 -0600 Subject: [PATCH 03/83] JM: moving things around. --- config/api_yaml_example.yaml | 14 ++ scripts/poll_tweets.py | 249 +++++++++++++++++++++++++++++++++++ scripts/search_tweets.py | 209 +++++++++++++++++++++++++++++ 3 files changed, 472 insertions(+) create mode 100644 config/api_yaml_example.yaml create mode 100644 scripts/poll_tweets.py create mode 100644 scripts/search_tweets.py diff --git a/config/api_yaml_example.yaml b/config/api_yaml_example.yaml new file mode 100644 index 0000000..bf0ca02 --- /dev/null +++ b/config/api_yaml_example.yaml @@ -0,0 +1,14 @@ +search_rules: + start-time: 2020-01-06 + end-time: 2020-01-10 + query: snow colorado -is:retweet has:media + +search_params: + results-per-call: 100 + max-tweets: 10000 + tweet-fields: id,created_at,text + +output_params: + save_file: True + filename_prefix: snow-photos + results_per_file: 100000 diff --git a/scripts/poll_tweets.py b/scripts/poll_tweets.py new file mode 100644 index 0000000..8e6b075 --- /dev/null +++ b/scripts/poll_tweets.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python +# Copyright 2020 Twitter, Inc. +# Licensed under the Apache License, Version 2.0 +# http://www.apache.org/licenses/LICENSE-2.0 +import os +import argparse +import json +import sys +import logging +import time +from searchtweets import (ResultStream, + load_credentials, + merge_dicts, + read_config, + write_result_stream, + gen_params_from_config) + +logger = logging.getLogger() +# we want to leave this here and have it command-line configurable via the +# --debug flag +logging.basicConfig(level=os.environ.get("LOGLEVEL", "ERROR")) + + +REQUIRED_KEYS = {"query", "endpoint"} + + +def parse_cmd_args(): + argparser = argparse.ArgumentParser() + help_msg = """configuration file with all parameters. Far, + easier to use than the command-line args version., + If a valid file is found, all args will be populated, + from there. Remaining command-line args, + will overrule args found in the config, + file.""" + + argparser.add_argument("--credential-file", + dest="credential_file", + default=None, + help=("Location of the yaml file used to hold " + "your credentials.")) + + argparser.add_argument("--credential-file-key", + dest="credential_yaml_key", + default=None, + help=("the key in the credential file used " + "for this session's credentials. " + "Defaults to search_tweets_api")) + + argparser.add_argument("--env-overwrite", + dest="env_overwrite", + default=True, + help=("""Overwrite YAML-parsed credentials with + any set environment variables. 
See API docs or + readme for details.""")) + + argparser.add_argument("--config-file", + dest="config_filename", + default=None, + help=help_msg) + + argparser.add_argument("--query", + dest="query", + default=None, + help="Search query. (See: https://developer.twitter.com/en/docs/labs/recent-search/guides/search-queries)") + + argparser.add_argument("--interval", + dest="interval", + default=5, + help="""Polling interval in minutes. (default: 5 minutes)""") + + + argparser.add_argument("--start-time", + dest="start_time", + default=None, + help="""Start of datetime window, format + 'YYYY-mm-DDTHH:MM' (default: -7 days)""") + + argparser.add_argument("--end-time", + dest="end_time", + default=None, + help="""End of datetime window, format + 'YYYY-mm-DDTHH:MM' (default: most recent + date)""") + + argparser.add_argument("--since-id", + dest="since_id", + default=None, + help="Tweet ID, will start search from Tweets after this one. (See: https://developer.twitter.com/en/docs/labs/recent-search/guides/pagination)") + + argparser.add_argument("--until-id", + dest="until_id", + default=None, + help="Tweet ID, will end search from Tweets before this one. (See: https://developer.twitter.com/en/docs/labs/recent-search/guides/pagination)") + + argparser.add_argument("--results-per-call", + dest="results_per_call", + help="Number of results to return per call " + "(default 10; max 100) - corresponds to " + "'max_results' in the API") + + argparser.add_argument("--tweet-fields", + dest="tweet_fields", + default=None, + help="""A comma-delimited list of Tweet JSON attributions to include in endpoint responses. (Endpoint default: "id,created_at,text")""") + + #client options. + argparser.add_argument("--max-tweets", dest="max_tweets", + type=int, + help="Maximum number of Tweets to return for this session of requests.") + + argparser.add_argument("--max-pages", + dest="max_pages", + type=int, + default=None, + help="Maximum number of pages/API calls to " + "use for this session.") + + argparser.add_argument("--results-per-file", dest="results_per_file", + default=None, + type=int, + help="Maximum tweets to save per file.") + + argparser.add_argument("--filename-prefix", + dest="filename_prefix", + default=None, + help="prefix for the filename where tweet " + " json data will be stored.") + + argparser.add_argument("--no-print-stream", + dest="print_stream", + action="store_false", + help="disable print streaming") + + argparser.add_argument("--print-stream", + dest="print_stream", + action="store_true", + default=True, + help="Print tweet stream to stdout") + + argparser.add_argument("--extra-headers", + dest="extra_headers", + type=str, + default=None, + help="JSON-formatted str representing a dict of additional HTTP request headers") + + argparser.add_argument("--debug", + dest="debug", + action="store_true", + default=False, + help="print all info and warning messages") + return argparser + + +def _filter_sensitive_args(dict_): + sens_args = ("consumer_key", "consumer_secret", "bearer_token") + return {k: v for k, v in dict_.items() if k not in sens_args} + +def main(): + args_dict = vars(parse_cmd_args().parse_args()) + if args_dict.get("debug") is True: + logger.setLevel(logging.DEBUG) + logger.debug("command line args dict:") + logger.debug(json.dumps(args_dict, indent=4)) + + if args_dict.get("config_filename") is not None: + configfile_dict = read_config(args_dict["config_filename"]) + else: + configfile_dict = {} + + 
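+    # Parse the optional JSON string of extra HTTP headers into a dict
+    # before it is passed along to the requests session.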
extra_headers_str = args_dict.get("extra_headers") + if extra_headers_str is not None: + args_dict['extra_headers_dict'] = json.loads(extra_headers_str) + del args_dict['extra_headers'] + + logger.debug("config file ({}) arguments sans sensitive args:".format(args_dict["config_filename"])) + logger.debug(json.dumps(_filter_sensitive_args(configfile_dict), indent=4)) + + creds_dict = load_credentials(filename=args_dict["credential_file"], + yaml_key=args_dict["credential_yaml_key"], + env_overwrite=args_dict["env_overwrite"]) + + dict_filter = lambda x: {k: v for k, v in x.items() if v is not None} + + config_dict = merge_dicts(dict_filter(configfile_dict), + dict_filter(creds_dict), + dict_filter(args_dict)) + + logger.debug("combined dict (cli, config, creds):") + logger.debug(json.dumps(_filter_sensitive_args(config_dict), indent=4)) + + if len(dict_filter(config_dict).keys() & REQUIRED_KEYS) < len(REQUIRED_KEYS): + print(REQUIRED_KEYS - dict_filter(config_dict).keys()) + logger.error("ERROR: not enough arguments for the script to work") + sys.exit(1) + + stream_params = gen_params_from_config(config_dict) + logger.debug("full arguments passed to the ResultStream object sans credentials") + logger.debug(json.dumps(_filter_sensitive_args(stream_params), indent=4)) + + while True: + + start = time.time() + rs = ResultStream(tweetify=False, **stream_params) + + logger.debug(str(rs)) + + if config_dict.get("filename_prefix") is not None: + stream = write_result_stream(rs, + filename_prefix=config_dict.get("filename_prefix"), + results_per_file=config_dict.get("results_per_file")) + else: + stream = rs.stream() + + first_tweet = True + tweets_num = 0 + + #Iterate through Tweet array and handle output. + for tweet in stream: + tweets_num = tweets_num + 1 + #Get Tweet ID from first Tweet + if first_tweet: + newest_id = tweet['id'] + first_tweet = False + if config_dict["print_stream"] is True: + print(json.dumps(tweet)) + + #This polling script switches to a since_id requests and removes the start_time parameter if it is used for backfill. + #Prepare next query, by setting the since_id request parameter. + print(f"{tweets_num} new Tweets. Newest_id: {newest_id}") + + request_json = json.loads(stream_params['request_parameters']) + + if 'start_time' in request_json.keys(): + del request_json['start_time'] + + request_json.update(since_id = newest_id) + stream_params['request_parameters'] = json.dumps(request_json) + + duration = time.time() - start + + sleep_interval = (float(config_dict["interval"]) * 60) - duration + + if sleep_interval < 0: + sleep_interval = (float(config_dict["interval"]) * 60) + + time.sleep(sleep_interval) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py new file mode 100644 index 0000000..356c449 --- /dev/null +++ b/scripts/search_tweets.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python +# Copyright 2020 Twitter, Inc. 
+# Licensed under the Apache License, Version 2.0 +# http://www.apache.org/licenses/LICENSE-2.0 +import os +import argparse +import json +import sys +import logging +from searchtweets import (ResultStream, + load_credentials, + merge_dicts, + read_config, + write_result_stream, + gen_params_from_config) + +logger = logging.getLogger() +# we want to leave this here and have it command-line configurable via the +# --debug flag +logging.basicConfig(level=os.environ.get("LOGLEVEL", "ERROR")) + + +REQUIRED_KEYS = {"query", "endpoint"} + + +def parse_cmd_args(): + argparser = argparse.ArgumentParser() + help_msg = """configuration file with all parameters. Far, + easier to use than the command-line args version., + If a valid file is found, all args will be populated, + from there. Remaining command-line args, + will overrule args found in the config, + file.""" + + argparser.add_argument("--credential-file", + dest="credential_file", + default=None, + help=("Location of the yaml file used to hold " + "your credentials.")) + + argparser.add_argument("--credential-file-key", + dest="credential_yaml_key", + default=None, + help=("the key in the credential file used " + "for this session's credentials. " + "Defaults to search_tweets_api")) + + argparser.add_argument("--env-overwrite", + dest="env_overwrite", + default=True, + help=("""Overwrite YAML-parsed credentials with + any set environment variables. See API docs or + readme for details.""")) + + argparser.add_argument("--config-file", + dest="config_filename", + default=None, + help=help_msg) + + argparser.add_argument("--query", + dest="query", + default=None, + help="Search query. (See: https://developer.twitter.com/en/docs/labs/recent-search/guides/search-queries)") + + argparser.add_argument("--start-time", + dest="start_time", + default=None, + help="""Start of datetime window, format + 'YYYY-mm-DDTHH:MM' (default: -7 days)""") + + argparser.add_argument("--end-time", + dest="end_time", + default=None, + help="""End of datetime window, format + 'YYYY-mm-DDTHH:MM' (default: most recent + date)""") + + argparser.add_argument("--since-id", + dest="since_id", + default=None, + help="Tweet ID, will start search from Tweets after this one. (See: https://developer.twitter.com/en/docs/labs/recent-search/guides/pagination)") + + argparser.add_argument("--until-id", + dest="until_id", + default=None, + help="Tweet ID, will end search from Tweets before this one. (See: https://developer.twitter.com/en/docs/labs/recent-search/guides/pagination)") + + argparser.add_argument("--results-per-call", + dest="results_per_call", + help="Number of results to return per call " + "(default 10; max 100) - corresponds to " + "'max_results' in the API") + + argparser.add_argument("--tweet-fields", + dest="tweet_fields", + default=None, + help="""A comma-delimited list of Tweet JSON attributions to include in endpoint responses. (default: "id,created_at,text")""") + + #client options. 
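+    #These arguments affect client-side behavior (paging, file output,
+    #printing) and are not sent to the API itself.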
+ argparser.add_argument("--max-tweets", dest="max_tweets", + type=int, + help="Maximum number of Tweets to return for this session of requests.") + + argparser.add_argument("--max-pages", + dest="max_pages", + type=int, + default=None, + help="Maximum number of pages/API calls to " + "use for this session.") + + argparser.add_argument("--results-per-file", dest="results_per_file", + default=None, + type=int, + help="Maximum tweets to save per file.") + + argparser.add_argument("--filename-prefix", + dest="filename_prefix", + default=None, + help="prefix for the filename where tweet " + " json data will be stored.") + + argparser.add_argument("--no-print-stream", + dest="print_stream", + action="store_false", + help="disable print streaming") + + argparser.add_argument("--print-stream", + dest="print_stream", + action="store_true", + default=True, + help="Print tweet stream to stdout") + + argparser.add_argument("--extra-headers", + dest="extra_headers", + type=str, + default=None, + help="JSON-formatted str representing a dict of additional HTTP request headers") + + argparser.add_argument("--debug", + dest="debug", + action="store_true", + default=False, + help="print all info and warning messages") + return argparser + + +def _filter_sensitive_args(dict_): + sens_args = ("consumer_key", "consumer_secret", "bearer_token") + return {k: v for k, v in dict_.items() if k not in sens_args} + +def main(): + args_dict = vars(parse_cmd_args().parse_args()) + if args_dict.get("debug") is True: + logger.setLevel(logging.DEBUG) + logger.debug("command line args dict:") + logger.debug(json.dumps(args_dict, indent=4)) + + if args_dict.get("config_filename") is not None: + configfile_dict = read_config(args_dict["config_filename"]) + else: + configfile_dict = {} + + extra_headers_str = args_dict.get("extra_headers") + if extra_headers_str is not None: + args_dict['extra_headers_dict'] = json.loads(extra_headers_str) + del args_dict['extra_headers'] + + logger.debug("config file ({}) arguments sans sensitive args:".format(args_dict["config_filename"])) + logger.debug(json.dumps(_filter_sensitive_args(configfile_dict), indent=4)) + + creds_dict = load_credentials(filename=args_dict["credential_file"], + yaml_key=args_dict["credential_yaml_key"], + env_overwrite=args_dict["env_overwrite"]) + + dict_filter = lambda x: {k: v for k, v in x.items() if v is not None} + + config_dict = merge_dicts(dict_filter(configfile_dict), + dict_filter(creds_dict), + dict_filter(args_dict)) + + logger.debug("combined dict (cli, config, creds):") + logger.debug(json.dumps(_filter_sensitive_args(config_dict), indent=4)) + + if len(dict_filter(config_dict).keys() & REQUIRED_KEYS) < len(REQUIRED_KEYS): + print(REQUIRED_KEYS - dict_filter(config_dict).keys()) + logger.error("ERROR: not enough arguments for the script to work") + sys.exit(1) + + stream_params = gen_params_from_config(config_dict) + logger.debug("full arguments passed to the ResultStream object sans credentials") + logger.debug(json.dumps(_filter_sensitive_args(stream_params), indent=4)) + + rs = ResultStream(tweetify=False, **stream_params) + + logger.debug(str(rs)) + + if config_dict.get("filename_prefix") is not None: + stream = write_result_stream(rs, + filename_prefix=config_dict.get("filename_prefix"), + results_per_file=config_dict.get("results_per_file")) + else: + stream = rs.stream() + + for tweet in stream: + if config_dict["print_stream"] is True: + print(json.dumps(tweet)) + +if __name__ == '__main__': + main() \ No newline at end of file From 
e78e04de230d0df05a9119fcdd1ba6826336f4b0 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Tue, 12 May 2020 20:17:09 -0600 Subject: [PATCH 04/83] JM: Labs updates. --- LICENSE | 0 README.rst | 3 +++ setup.py | 10 +++++----- 3 files changed, 8 insertions(+), 5 deletions(-) mode change 100644 => 100755 LICENSE diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.rst b/README.rst index 2bd7022..0697a00 100644 --- a/README.rst +++ b/README.rst @@ -1,3 +1,5 @@ +[![Labs v2](https://img.shields.io/static/v1?label=Twitter%20API&message=Developer%20Labs%20v2&color=794BC4&style=flat&logo=Twitter)](https://developer.twitter.com/en/docs/labs/overview/versioning) + Python Twitter Search API ========================= @@ -886,3 +888,4 @@ this) and commit the result: .. code:: bash bash make_readme.sh + diff --git a/setup.py b/setup.py index 5831758..bcf62d2 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2018 Twitter, Inc. +# Copyright 2020 Twitter, Inc. # Licensed under the MIT License # https://opensource.org/licenses/MIT import re @@ -23,15 +23,15 @@ def parse_version(str_): VERSION = parse_version(_version_line) setup(name='searchtweets', - description="Wrapper for Twitter's Premium and Enterprise search APIs", + description="Wrapper for Twitter Developer Labs Recent search endpoint.", url='https://github.com/twitterdev/search-tweets-python', - author='Fiona Pigott, Jeff Kolb, Josh Montague, Aaron Gonzales', + author='Fiona Pigott, Jeff Kolb, Josh Montague, Aaron Gonzales, Jim Moffitt', long_description=open('README.rst', 'r', encoding="utf-8").read(), - author_email='agonzales@twitter.com', + author_email='dev-support@twitter.com', license='MIT', version=VERSION, python_requires='>=3.3', install_requires=["requests", "tweet_parser", "pyyaml"], packages=find_packages(), - scripts=["tools/search_tweets.py"], + scripts=["tools/search_tweets.py","tools/polling_app.py"], ) From 60e1539e4b9747e013aec54af4665b3fe4319ea5 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 12 May 2020 20:27:21 -0600 Subject: [PATCH 05/83] Update README.rst --- README.rst | 1066 ++++++++++++---------------------------------------- 1 file changed, 247 insertions(+), 819 deletions(-) diff --git a/README.rst b/README.rst index 0697a00..df86d0b 100644 --- a/README.rst +++ b/README.rst @@ -1,891 +1,319 @@ -[![Labs v2](https://img.shields.io/static/v1?label=Twitter%20API&message=Developer%20Labs%20v2&color=794BC4&style=flat&logo=Twitter)](https://developer.twitter.com/en/docs/labs/overview/versioning) +Python client for Labs Recent search +==================================== -Python Twitter Search API -========================= +This project serves as a wrapper for the `Twitter Labs recent search +APIs `__, +providing a command-line utility and a Python library. -This project serves as a wrapper for the `Twitter premium and enterprise -search -APIs `__, -providing a command-line utility and a Python library. Pretty docs can -be seen `here `__. +This is a fork of the premium/enterprise search client at https://github.com/twitterdev/search-tweets-python. -Features -======== - -- Supports 30-day Search and Full Archive Search (not the standard - Search API at this time). -- Command-line utility is pipeable to other tools (e.g., ``jq``). 
-- Automatically handles pagination of search results with specifiable - limits -- Delivers a stream of data to the user for low in-memory requirements -- Handles enterprise and premium authentication methods -- Flexible usage within a python program -- Compatible with our group's `Tweet - Parser `__ for rapid - extraction of relevant data fields from each tweet payload -- Supports the Search Counts endpoint, which can reduce API call usage - and provide rapid insights if you only need Tweet volumes and not - Tweet payloads - -Installation -============ +If you are working with an enterprise or premium 30-day or Full-archive search endpoint, the ```master``` branch of this repository has what you need. -The ``searchtweets`` library is on Pypi: -.. code:: bash +Features +======== - pip install searchtweets +- Supports Labs Recent search, v2. +- Supports a new "polling" mode using the new Labs ```since_id``` search request parameter. The ```since_id```, along with the new ```until_id``` provide a way to navigate the public Tweet archive by Tweet ID. +- Supports additional ways to specify ```start_time``` and ```end_time``` request parameters: -Or you can install the development version locally via + - d# - For example, 'd2' sets ```start_time``` to (exactly) two days ago. + - h# - For example, 'h12' sets ```start_time``` to (exactly) twelve hours ago. + - m# - For example, 'm15' sets ```start_time``` to (exactly) fifteen minutes ago. + + These are handy for kicking off searches with a backfill period, and also work with the ```end_time``` request parameter. -.. code:: bash +These features were inherited from the enterprise/premium version. +- Command-line utility is pipeable to other tools (e.g., ``jq``). +- Automatically handles pagination of search results with specifiable + limits. +- Delivers a stream of data to the user for low in-memory requirements. +- Handles OAuth 2 and Bearer Token authentication. +- Flexible usage within a python program. - git clone https://github.com/twitterdev/search-tweets-python - cd search-tweets-python - pip install -e . --------------- +Labs updates +============ -Credential Handling -=================== - -The premium and enterprise Search APIs use different authentication -methods and we attempt to provide a seamless way to handle -authentication for all customers. We know credentials can be tricky or -annoying - please read this in its entirety. - -Premium clients will require the ``bearer_token`` and ``endpoint`` -fields; Enterprise clients require ``username``, ``password``, and -``endpoint``. If you do not specify the ``account_type``, we attempt to -discern the account type and declare a warning about this behavior. - -For premium search products, we are using app-only authentication and -the bearer tokens are not delivered with an expiration time. You can -provide either: - your application key and secret (the library will -handle bearer-token authentication) - a bearer token that you get -yourself - -Many developers might find providing your application key and secret -more straightforward and letting this library manage your bearer token -generation for you. Please see -`here `__ -for an overview of the premium authentication method. - -We support both YAML-file based methods and environment variables for -storing credentials, and provide flexible handling with sensible -defaults. 
+When migrating this Python search client from an enterprise or premium search endpoint, the following updates were made: + +- Added support for GET requests (and removed POST support for now) +- Added support for since_id and until_id request parameters. +- Updated pagination details. +- Updated app command-line parlance + - --start-datetime → --start-time + - --end-datetime → --end-time + - --filter-rule → --query + - --max-results → --max-tweets + - Dropped --account-type. + - Dropped --count-bucket. Removed search 'counts' endpoint support. This endpoint is currently not available in Labs. + + +Command-line options +===================== + +usage: search_tweets.py + +optional arguments: + -h, --help show this help message and exit + --credential-file CREDENTIAL_FILE + Location of the yaml file used to hold your + credentials. + --credential-file-key CREDENTIAL_YAML_KEY + the key in the credential file used for this session's + credentials. Defaults to search_tweets_api + --env-overwrite ENV_OVERWRITE + Overwrite YAML-parsed credentials with any set + environment variables. See API docs or readme for + details. + --config-file CONFIG_FILENAME + configuration file with all parameters. Far, easier to + use than the command-line args version., If a valid + file is found, all args will be populated, from there. + Remaining command-line args, will overrule args found + in the config, file. + --start-time START_TIME + Start of datetime window, format 'YYYY-mm-DDTHH:MM' + (default: -7 days) + --end-time END_TIME End of datetime window, format 'YYYY-mm-DDTHH:MM' + (default: most recent date) + --query QUERY Search query. (See: + https://developer.twitter.com/en/docs/labs/recent- + search/guides/search-queries) + --since-id SINCE_ID Tweet ID, will start search from Tweets after this + one. (See: + https://developer.twitter.com/en/docs/labs/recent- + search/guides/pagination) + --until-id UNTIL_ID Tweet ID, will end search from Tweets before this one. + (See: + https://developer.twitter.com/en/docs/labs/recent- + search/guides/pagination) + --results-per-call RESULTS_PER_CALL + Number of results to return per call (default 10; max + 100) - corresponds to 'max_results' in the API + --max-tweets MAX_TWEETS + Maximum number of Tweets to return for this session of + requests. + --max-pages MAX_PAGES + Maximum number of pages/API calls to use for this + session. + --results-per-file RESULTS_PER_FILE + Maximum tweets to save per file. + --filename-prefix FILENAME_PREFIX + prefix for the filename where tweet json data will be + stored. + --no-print-stream disable print streaming + --print-stream Print tweet stream to stdout + --extra-headers EXTRA_HEADERS + JSON-formatted str representing a dict of additional + request headers + --debug print all info and warning messages + + +Migrating from enterprise/premium library +========================================= -YAML method ------------ -For premium customers, the simplest credential file should look like -this: -.. code:: yaml - search_tweets_api: - account_type: premium - endpoint: - consumer_key: - consumer_secret: -For enterprise customers, the simplest credential file should look like -this: -.. 
code:: yaml - search_tweets_api: - account_type: enterprise - endpoint: - username: - password: -By default, this library expects this file at -``"~/.twitter_keys.yaml"``, but you can pass the relevant location as -needed, either with the ``--credential-file`` flag for the command-line -app or as demonstrated below in a Python program. -Both above examples require no special command-line arguments or -in-program arguments. The credential parsing methods, unless otherwise -specified, will look for a YAML key called ``search_tweets_api``. -For developers who have multiple endpoints and/or search products, you -can keep all credentials in the same file and specify specific keys to -use. ``--credential-file-key`` specifies this behavior in the command -line app. An example: +Installation +============= -.. code:: yaml +{Are there any new conventions?} +Maintaing two packages: ++ searchtweets (current enterprise/premium package) ++ searchtweetslabs +Eventually, there will be searchtweetsv2, and searchtweets will be dropped. - search_tweets_30_day_dev: - account_type: premium - endpoint: - consumer_key: - consumer_secret: - (optional) bearer_token: +The searchtweets library is on Pypi: +pip install searchtweets +Or you can install the development version locally via - search_tweets_30_day_prod: - account_type: premium - endpoint: - bearer_token: +git clone https://github.com/twitterdev/search-tweets-python +cd search-tweets-python +pip install -e . +Credential Handling +The premium and enterprise Search APIs use different authentication methods and we attempt to provide a seamless way to handle authentication for all customers. We know credentials can be tricky or annoying - please read this in its entirety. - search_tweets_fullarchive_dev: - account_type: premium - endpoint: - bearer_token: +Premium clients will require the bearer_token and endpoint fields; Enterprise clients require username, password, and endpoint. If you do not specify the account_type, we attempt to discern the account type and declare a warning about this behavior. - search_tweets_fullarchive_prod: - account_type: premium - endpoint: - bearer_token: +For premium search products, we are using app-only authentication and the bearer tokens are not delivered with an expiration time. You can provide either: - your application key and secret (the library will handle bearer-token authentication) - a bearer token that you get yourself -Environment Variables ---------------------- +Many developers might find providing your application key and secret more straightforward and letting this library manage your bearer token generation for you. Please see here for an overview of the premium authentication method. -If you want or need to pass credentials via environment variables, you -can set the appropriate variables for your product of the following: +We support both YAML-file based methods and environment variables for storing credentials, and provide flexible handling with sensible defaults. 
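
The token exchange the client performs on your behalf is the standard app-only OAuth 2 flow. As a rough sketch (illustrative only, not the library's internal code; it assumes the ``requests`` package and Twitter's standard ``oauth2/token`` endpoint):

.. code:: python

    # Illustrative sketch: exchange a consumer key/secret for a Bearer Token
    # via Twitter's standard app-only OAuth2 token endpoint.
    import requests

    def fetch_bearer_token(consumer_key, consumer_secret):
        resp = requests.post("https://api.twitter.com/oauth2/token",
                             auth=(consumer_key, consumer_secret),
                             data={"grant_type": "client_credentials"})
        resp.raise_for_status()
        # Response JSON looks like {"token_type": "bearer", "access_token": "..."}
        return resp.json()["access_token"]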
-:: +YAML method +For premium customers, the simplest credential file should look like this: - export SEARCHTWEETS_ENDPOINT= - export SEARCHTWEETS_USERNAME= - export SEARCHTWEETS_PASSWORD= - export SEARCHTWEETS_BEARER_TOKEN= - export SEARCHTWEETS_ACCOUNT_TYPE= - export SEARCHTWEETS_CONSUMER_KEY= - export SEARCHTWEETS_CONSUMER_SECRET= +search_tweets_endpoint: + endpoint: + consumer_key: + consumer_secret: -The ``load_credentials`` function will attempt to find these variables -if it cannot load fields from the YAML file, and it will **overwrite any -credentials from the YAML file that are present as environment -variables** if they have been parsed. This behavior can be changed by -setting the ``load_credentials`` parameter ``env_overwrite`` to -``False``. +By default, this library expects this file at "~/.twitter_keys.yaml", but you can pass the relevant location as needed, either with the --credential-file flag for the command-line app or as demonstrated below in a Python program. -The following cells demonstrates credential handling in the Python -library. +Both above examples require no special command-line arguments or in-program arguments. The credential parsing methods, unless otherwise specified, will look for a YAML key called search_tweets_api. -.. code:: python +For developers who have multiple endpoints and/or search products, you can keep all credentials in the same file and specify specific keys to use. --credential-file-key specifies this behavior in the command line app. An example: - from searchtweets import load_credentials +search_tweets_labsv1: + endpoint: + consumer_key: + consumer_secret: + (optional) bearer_token: -.. code:: python +search_tweets_labsv2: + endpoint: + consumer_key: + consumer_secret: + (optional) bearer_token: - load_credentials(filename="./search_tweets_creds_example.yaml", - yaml_key="search_tweets_ent_example", - env_overwrite=False) -:: +Environment Variables - {'username': '', - 'password': '', - 'endpoint': ''} +If you want or need to pass credentials via environment variables, you can set the appropriate variables for your product of the following: -.. code:: python +export SEARCHTWEETS_ENDPOINT= +export SEARCHTWEETS_BEARER_TOKEN= +export SEARCHTWEETS_CONSUMER_KEY= +export SEARCHTWEETS_CONSUMER_SECRET= - load_credentials(filename="./search_tweets_creds_example.yaml", - yaml_key="search_tweets_premium_example", - env_overwrite=False) +The load_credentials function will attempt to find these variables if it cannot load fields from the YAML file, and it will overwrite any credentials from the YAML file that are present as environment variables if they have been parsed. This behavior can be changed by setting the load_credentials parameter env_overwrite to False. -:: +The following cells demonstrates credential handling in the Python library. 
- {'bearer_token': '', - 'endpoint': 'https://api.twitter.com/1.1/tweets/search/30day/dev.json', - 'extra_headers_dict': None} +from searchtweets import load_credentials +load_credentials(filename="./search_tweets_creds_example.yaml", + yaml_key="search_tweets_ent_example", + env_overwrite=False) +{ 'endpoint': ''} +load_credentials(filename="./search_tweets_creds_example.yaml", + yaml_key="search_tweetsv2_example", + env_overwrite=False) + +{'bearer_token': '', + 'endpoint': 'https://api.twitter.com/labs/2/tweets/search', + 'extra_headers_dict': None} + + Environment Variable Overrides ------------------------------- - -If we set our environment variables, the program will look for them -regardless of a YAML file's validity or existence. - -.. code:: python - import os - os.environ["SEARCHTWEETS_USERNAME"] = "" - os.environ["SEARCHTWEETS_PASSWORD"] = "" - os.environ["SEARCHTWEETS_ENDPOINT"] = "" +If we set our environment variables, the program will look for them regardless of a YAML file's validity or existence. - load_credentials(filename="nothing_here.yaml", yaml_key="no_key_here") +import os +os.environ["SEARCHTWEETS_USERNAME"] = "" +os.environ["SEARCHTWEETS_BEARERTOKEN"] = "" +os.environ["SEARCHTWEETS_ENDPOINT"] = "" -:: +load_credentials(filename="nothing_here.yaml", yaml_key="no_key_here") +cannot read file nothing_here.yaml - cannot read file nothing_here.yaml - Error parsing YAML file; searching for valid environment variables - -:: - - {'username': '', - 'password': '', - 'endpoint': ''} +Error parsing YAML file; searching for valid environment variables +{'bearer_token': '', + 'endpoint': ''} Command-line app ----------------- the flags: -- ``--credential-file `` -- ``--credential-file-key `` -- ``--env-overwrite`` - +--credential-file +--credential-file-key +--env-overwrite are used to control credential behavior from the command-line app. --------------- - Using the Comand Line Application -================================= - -The library includes an application, ``search_tweets.py``, that provides -rapid access to Tweets. When you use ``pip`` to install this package, -``search_tweets.py`` is installed globally. The file is located in the -``tools/`` directory for those who want to run it locally. - -Note that the ``--results-per-call`` flag specifies an argument to the -API ( ``maxResults``, results returned per CALL), not as a hard max to -number of results returned from this program. The argument -``--max-results`` defines the maximum number of results to return from a -given call. All examples assume that your credentials are set up -correctly in the default location - ``.twitter_keys.yaml`` or in -environment variables. - -**Stream json results to stdout without saving** - -.. code:: bash - - search_tweets.py \ - --max-results 1000 \ - --results-per-call 100 \ - --filter-rule "beyonce has:hashtags" \ - --print-stream - -**Stream json results to stdout and save to a file** - -.. code:: bash - - search_tweets.py \ - --max-results 1000 \ - --results-per-call 100 \ - --filter-rule "beyonce has:hashtags" \ - --filename-prefix beyonce_geo \ - --print-stream - -**Save to file without output** - -.. 
code:: bash - - search_tweets.py \ - --max-results 100 \ - --results-per-call 100 \ - --filter-rule "beyonce has:hashtags" \ - --filename-prefix beyonce_geo \ - --no-print-stream - -One or more custom headers can be specified from the command line, using -the ``--extra-headers`` argument and a JSON-formatted string -representing a dictionary of extra headers: - -.. code:: bash - - search_tweets.py \ - --filter-rule "beyonce has:hashtags" \ - --extra-headers '{"":""}' - -Options can be passed via a configuration file (either ini or YAML). -Example files can be found in the ``tools/api_config_example.config`` or -``./tools/api_yaml_example.yaml`` files, which might look like this: - -.. code:: bash - - [search_rules] - from_date = 2017-06-01 - to_date = 2017-09-01 - pt_rule = beyonce has:geo - - [search_params] - results_per_call = 500 - max_results = 500 - - [output_params] - save_file = True - filename_prefix = beyonce - results_per_file = 10000000 +The library includes an application, search_tweets.py, that provides rapid access to Tweets. When you use pip to install this package, search_tweets.py is installed globally. The file is located in the tools/ directory for those who want to run it locally. + +Note that the --results-per-call flag specifies an argument to the API ( maxResults, results returned per CALL), not as a hard max to number of results returned from this program. The argument --max-results defines the maximum number of results to return from a given call. All examples assume that your credentials are set up correctly in the default location - .twitter_keys.yaml or in environment variables. + +Stream json results to stdout without saving + +search_tweets.py \ + --max-results 1000 \ + --results-per-call 100 \ + --query "(snow OR rain) has:media -is:retweet" \ + --print-stream +Stream json results to stdout and save to a file + +search_tweets.py \ + --max-results 1000 \ + --results-per-call 100 \ + --query "(snow OR rain) has:media -is:retweet" \ + --filename-prefix beyonce_geo \ + --print-stream +Save to file without output + +search_tweets.py \ + --max-results 100 \ + --results-per-call 100 \ + --query "(snow OR rain) has:media -is:retweet" \ + --filename-prefix beyonce_geo \ + --no-print-stream +One or more custom headers can be specified from the command line, using the --extra-headers argument and a JSON-formatted string representing a dictionary of extra headers: + +search_tweets.py \ + --query "(snow OR rain) has:media -is:retweet" \ + --extra-headers '{"":""}' +Options can be passed via a configuration file (either ini or YAML). Example files can be found in the tools/api_config_example.config or ./tools/api_yaml_example.yaml files, which might look like this: + +[search_rules] +start_time = 2020-05-01 +end_time = 2020-05-01 +query = (snow OR rain) has:media -is:retweet + +[search_params] +results_per_call = 100 +max_tweets = 10000 + +[output_params] +save_file = True +filename_prefix = weather-pics +results_per_file = 10000000 Or this: -.. 
code:: yaml +search_rules: + start_time: 2017-06-01 + end_time: 2017-09-01 01:01 + query: (snow OR rain) has:media -is:retweet - search_rules: - from-date: 2017-06-01 - to-date: 2017-09-01 01:01 - pt-rule: kanye +search_params: + results-per-call: 100 + max-results: 500 - search_params: - results-per-call: 500 - max-results: 500 +output_params: + save_file: True + filename_prefix: (snow OR rain) has:media -is:retweet + results_per_file: 10000000 +Custom headers can be specified in a config file, under a specific credentials key: - output_params: - save_file: True - filename_prefix: kanye - results_per_file: 10000000 - -Custom headers can be specified in a config file, under a specific -credentials key: - -.. code:: yaml - - search_tweets_api: - account_type: premium - endpoint: - username: - password: - extra_headers: - : - -When using a config file in conjunction with the command-line utility, -you need to specify your config file via the ``--config-file`` -parameter. Additional command-line arguments will either be *added* to -the config file args or **overwrite** the config file args if both are -specified and present. +search_tweets_api: + endpoint: + bearer_token: + extra_headers: + : +When using a config file in conjunction with the command-line utility, you need to specify your config file via the --config-file parameter. Additional command-line arguments will either be added to the config file args or overwrite the config file args if both are specified and present. Example: -:: - - search_tweets.py \ - --config-file myapiconfig.config \ - --no-print-stream - --------------- - +search_tweets.py \ + --config-file myapiconfig.config \ + --no-print-stream Full options are listed below: -:: - - $ search_tweets.py -h - usage: search_tweets.py [-h] [--credential-file CREDENTIAL_FILE] - [--credential-file-key CREDENTIAL_YAML_KEY] - [--env-overwrite ENV_OVERWRITE] - [--config-file CONFIG_FILENAME] - [--account-type {premium,enterprise}] - [--count-bucket COUNT_BUCKET] - [--start-datetime FROM_DATE] [--end-datetime TO_DATE] - [--filter-rule PT_RULE] - [--results-per-call RESULTS_PER_CALL] - [--max-results MAX_RESULTS] [--max-pages MAX_PAGES] - [--results-per-file RESULTS_PER_FILE] - [--filename-prefix FILENAME_PREFIX] - [--no-print-stream] [--print-stream] - [--extra-headers EXTRA_HEADERS] [--debug] - - optional arguments: - -h, --help show this help message and exit - --credential-file CREDENTIAL_FILE - Location of the yaml file used to hold your - credentials. - --credential-file-key CREDENTIAL_YAML_KEY - the key in the credential file used for this session's - credentials. Defaults to search_tweets_api - --env-overwrite ENV_OVERWRITE - Overwrite YAML-parsed credentials with any set - environment variables. See API docs or readme for - details. - --config-file CONFIG_FILENAME - configuration file with all parameters. Far, easier to - use than the command-line args version., If a valid - file is found, all args will be populated, from there. - Remaining command-line args, will overrule args found - in the config, file. - --account-type {premium,enterprise} - The account type you are using - --count-bucket COUNT_BUCKET - Bucket size for counts API. Options:, day, hour, - minute (default is 'day'). 
- --start-datetime FROM_DATE - Start of datetime window, format 'YYYY-mm-DDTHH:MM' - (default: -30 days) - --end-datetime TO_DATE - End of datetime window, format 'YYYY-mm-DDTHH:MM' - (default: most recent date) - --filter-rule PT_RULE - PowerTrack filter rule (See: http://support.gnip.com/c - ustomer/portal/articles/901152-powertrack-operators) - --results-per-call RESULTS_PER_CALL - Number of results to return per call (default 100; max - 500) - corresponds to 'maxResults' in the API - --max-results MAX_RESULTS - Maximum number of Tweets or Counts to return for this - session (defaults to 500) - --max-pages MAX_PAGES - Maximum number of pages/API calls to use for this - session. - --results-per-file RESULTS_PER_FILE - Maximum tweets to save per file. - --filename-prefix FILENAME_PREFIX - prefix for the filename where tweet json data will be - stored. - --no-print-stream disable print streaming - --print-stream Print tweet stream to stdout - --extra-headers EXTRA_HEADERS - JSON-formatted str representing a dict of additional - request headers - --debug print all info and warning messages - --------------- - -Using the Twitter Search APIs' Python Wrapper -============================================= - -Working with the API within a Python program is straightforward both for -Premium and Enterprise clients. - -We'll assume that credentials are in the default location, -``~/.twitter_keys.yaml``. - -.. code:: python - - from searchtweets import ResultStream, gen_rule_payload, load_credentials - -Enterprise setup ----------------- - -.. code:: python - - enterprise_search_args = load_credentials("~/.twitter_keys.yaml", - yaml_key="search_tweets_enterprise", - env_overwrite=False) - -Premium Setup -------------- - -.. code:: python - - premium_search_args = load_credentials("~/.twitter_keys.yaml", - yaml_key="search_tweets_premium", - env_overwrite=False) - -There is a function that formats search API rules into valid json -queries called ``gen_rule_payload``. It has sensible defaults, such as -pulling more Tweets per call than the default 100 (but note that a -sandbox environment can only have a max of 100 here, so if you get -errors, please check this) not including dates, and defaulting to hourly -counts when using the counts api. Discussing the finer points of -generating search rules is out of scope for these examples; I encourage -you to see the docs to learn the nuances within, but for now let's see -what a rule looks like. - -.. code:: python - - rule = gen_rule_payload("beyonce", results_per_call=100) # testing with a sandbox account - print(rule) - -:: - - {"query":"beyonce","maxResults":100} - -This rule will match tweets that have the text ``beyonce`` in them. - -From this point, there are two ways to interact with the API. There is a -quick method to collect smaller amounts of Tweets to memory that -requires less thought and knowledge, and interaction with the -``ResultStream`` object which will be introduced later. - -Fast Way --------- - -We'll use the ``search_args`` variable to power the configuration point -for the API. The object also takes a valid PowerTrack rule and has -options to cutoff search when hitting limits on both number of Tweets -and API calls. - -We'll be using the ``collect_results`` function, which has three -parameters. 
- -- rule: a valid PowerTrack rule, referenced earlier -- max_results: as the API handles pagination, it will stop collecting - when we get to this number -- result_stream_args: configuration args that we've already specified. - -For the remaining examples, please change the args to either premium or -enterprise depending on your usage. - -Let's see how it goes: - -.. code:: python - - from searchtweets import collect_results - -.. code:: python - - tweets = collect_results(rule, - max_results=100, - result_stream_args=enterprise_search_args) # change this if you need to - -By default, Tweet payloads are lazily parsed into a ``Tweet`` -`object `__. An overwhelming -number of Tweet attributes are made available directly, as such: - -.. code:: python - - [print(tweet.all_text, end='\n\n') for tweet in tweets[0:10]]; - -:: - - Jay-Z & Beyoncé sat across from us at dinner tonight and, at one point, I made eye contact with Beyoncé. My limbs turned to jello and I can no longer form a coherent sentence. I have seen the eyes of the lord. - - Beyoncé and it isn't close. https://t.co/UdOU9oUtuW - - As you could guess.. Signs by Beyoncé will always be my shit. - - When Beyoncé adopts a dog 🙌🏾 https://t.co/U571HyLG4F - - Hold up, you can't just do that to Beyoncé - https://t.co/3p14DocGqA - - Why y'all keep using Rihanna and Beyoncé gifs to promote the show when y'all let Bey lose the same award she deserved 3 times and let Rihanna leave with nothing but the clothes on her back? https://t.co/w38QpH0wma - - 30) anybody tell you that you look like Beyoncé https://t.co/Vo4Z7bfSCi - - Mi Beyoncé favorita https://t.co/f9Jp600l2B - Beyoncé necesita ver esto. Que diosa @TiniStoessel 🔥🔥🔥 https://t.co/gadVJbehQZ - - Joanne Pearce Is now playing IF I WAS A BOY - BEYONCE.mp3 by ! - - I'm trynna see beyoncé's finsta before I die - -.. code:: python - - [print(tweet.created_at_datetime) for tweet in tweets[0:10]]; - -:: - - 2018-01-17 00:08:50 - 2018-01-17 00:08:49 - 2018-01-17 00:08:44 - 2018-01-17 00:08:42 - 2018-01-17 00:08:42 - 2018-01-17 00:08:42 - 2018-01-17 00:08:40 - 2018-01-17 00:08:38 - 2018-01-17 00:08:37 - 2018-01-17 00:08:37 - -.. code:: python - - [print(tweet.generator.get("name")) for tweet in tweets[0:10]]; - -:: - - Twitter for iPhone - Twitter for iPhone - Twitter for iPhone - Twitter for iPhone - Twitter for iPhone - Twitter for iPhone - Twitter for Android - Twitter for iPhone - Airtime Pro - Twitter for iPhone - -Voila, we have some Tweets. For interactive environments and other cases -where you don't care about collecting your data in a single load or -don't need to operate on the stream of Tweets or counts directly, I -recommend using this convenience function. - -Working with the ResultStream ------------------------------ - -The ResultStream object will be powered by the ``search_args``, and -takes the rules and other configuration parameters, including a hard -stop on number of pages to limit your API call usage. - -.. 
code:: python - - rs = ResultStream(rule_payload=rule, - max_results=500, - max_pages=1, - **premium_search_args) - - print(rs) - -:: - - ResultStream: - { - "username":null, - "endpoint":"https:\/\/fanyv88.com:443\/https\/api.twitter.com\/1.1\/tweets\/search\/30day\/dev.json", - "rule_payload":{ - "query":"beyonce", - "maxResults":100 - }, - "tweetify":true, - "max_results":500 - } - -There is a function, ``.stream``, that seamlessly handles requests and -pagination for a given query. It returns a generator, and to grab our -500 Tweets that mention ``beyonce`` we can do this: - -.. code:: python - - tweets = list(rs.stream()) - -Tweets are lazily parsed using our `Tweet -Parser `__, so tweet data is -very easily extractable. - -.. code:: python - - # using unidecode to prevent emoji/accents printing - [print(tweet.all_text) for tweet in tweets[0:10]]; - -:: - - gente socorro kkkkkkkkkk BEYONCE https://t.co/kJ9zubvKuf - Jay-Z & Beyoncé sat across from us at dinner tonight and, at one point, I made eye contact with Beyoncé. My limbs turned to jello and I can no longer form a coherent sentence. I have seen the eyes of the lord. - Beyoncé and it isn't close. https://t.co/UdOU9oUtuW - As you could guess.. Signs by Beyoncé will always be my shit. - When Beyoncé adopts a dog 🙌🏾 https://t.co/U571HyLG4F - Hold up, you can't just do that to Beyoncé - https://t.co/3p14DocGqA - Why y'all keep using Rihanna and Beyoncé gifs to promote the show when y'all let Bey lose the same award she deserved 3 times and let Rihanna leave with nothing but the clothes on her back? https://t.co/w38QpH0wma - 30) anybody tell you that you look like Beyoncé https://t.co/Vo4Z7bfSCi - Mi Beyoncé favorita https://t.co/f9Jp600l2B - Beyoncé necesita ver esto. Que diosa @TiniStoessel 🔥🔥🔥 https://t.co/gadVJbehQZ - Joanne Pearce Is now playing IF I WAS A BOY - BEYONCE.mp3 by ! - -Counts Endpoint ---------------- - -We can also use the Search API Counts endpoint to get counts of Tweets -that match our rule. Each request will return up to *30* results, and -each count request can be done on a minutely, hourly, or daily basis. -The underlying ``ResultStream`` object will handle converting your -endpoint to the count endpoint, and you have to specify the -``count_bucket`` argument when making a rule to use it. - -The process is very similar to grabbing Tweets, but has some minor -differences. - -*Caveat - premium sandbox environments do NOT have access to the Search -API counts endpoint.* - -.. code:: python - - count_rule = gen_rule_payload("beyonce", count_bucket="day") - - counts = collect_results(count_rule, result_stream_args=enterprise_search_args) - -Our results are pretty straightforward and can be rapidly used. - -.. 
code:: python - - counts - -:: - - [{'count': 366, 'timePeriod': '201801170000'}, - {'count': 44580, 'timePeriod': '201801160000'}, - {'count': 61932, 'timePeriod': '201801150000'}, - {'count': 59678, 'timePeriod': '201801140000'}, - {'count': 44014, 'timePeriod': '201801130000'}, - {'count': 46607, 'timePeriod': '201801120000'}, - {'count': 41523, 'timePeriod': '201801110000'}, - {'count': 47056, 'timePeriod': '201801100000'}, - {'count': 65506, 'timePeriod': '201801090000'}, - {'count': 95251, 'timePeriod': '201801080000'}, - {'count': 162883, 'timePeriod': '201801070000'}, - {'count': 106344, 'timePeriod': '201801060000'}, - {'count': 93542, 'timePeriod': '201801050000'}, - {'count': 110415, 'timePeriod': '201801040000'}, - {'count': 127523, 'timePeriod': '201801030000'}, - {'count': 131952, 'timePeriod': '201801020000'}, - {'count': 176157, 'timePeriod': '201801010000'}, - {'count': 57229, 'timePeriod': '201712310000'}, - {'count': 72277, 'timePeriod': '201712300000'}, - {'count': 72051, 'timePeriod': '201712290000'}, - {'count': 76371, 'timePeriod': '201712280000'}, - {'count': 61578, 'timePeriod': '201712270000'}, - {'count': 55118, 'timePeriod': '201712260000'}, - {'count': 59115, 'timePeriod': '201712250000'}, - {'count': 106219, 'timePeriod': '201712240000'}, - {'count': 114732, 'timePeriod': '201712230000'}, - {'count': 73327, 'timePeriod': '201712220000'}, - {'count': 89171, 'timePeriod': '201712210000'}, - {'count': 192381, 'timePeriod': '201712200000'}, - {'count': 85554, 'timePeriod': '201712190000'}, - {'count': 57829, 'timePeriod': '201712180000'}] - -Dated searches / Full Archive Search ------------------------------------- - -**Note that this will only work with the full archive search option**, -which is available to my account only via the enterprise options. Full -archive search will likely require a different endpoint or access -method; please see your developer console for details. - -Let's make a new rule and pass it dates this time. - -``gen_rule_payload`` takes timestamps of the following forms: - -- ``YYYYmmDDHHMM`` -- ``YYYY-mm-DD`` (which will convert to midnight UTC (00:00) -- ``YYYY-mm-DD HH:MM`` -- ``YYYY-mm-DDTHH:MM`` - -Note - all Tweets are stored in UTC time. - -.. code:: python - - rule = gen_rule_payload("from:jack", - from_date="2017-09-01", #UTC 2017-09-01 00:00 - to_date="2017-10-30",#UTC 2017-10-30 00:00 - results_per_call=500) - print(rule) - -:: - - {"query":"from:jack","maxResults":500,"toDate":"201710300000","fromDate":"201709010000"} - -.. code:: python - - tweets = collect_results(rule, max_results=500, result_stream_args=enterprise_search_args) - -.. code:: python - - [print(tweet.all_text) for tweet in tweets[0:10]]; - -:: - - More clarity on our private information policy and enforcement. Working to build as much direct context into the product too https://t.co/IrwBexPrBA - To provide more clarity on our private information policy, we’ve added specific examples of what is/is not a violation and insight into what we need to remove this type of content from the service. https://t.co/NGx5hh2tTQ - Launching violent groups and hateful images/symbols policy on November 22nd https://t.co/NaWuBPxyO5 - We will now launch our policies on violent groups and hateful imagery and hate symbols on Nov 22. During the development process, we received valuable feedback that we’re implementing before these are published and enforced. 
See more on our policy development process here 👇 https://t.co/wx3EeH39BI - @WillStick @lizkelley Happy birthday Liz! - Off-boarding advertising from all accounts owned by Russia Today (RT) and Sputnik. - - We’re donating all projected earnings ($1.9mm) to support external research into the use of Twitter in elections, including use of malicious automation and misinformation. https://t.co/zIxfqqXCZr - @TMFJMo @anthonynoto Thank you - @gasca @stratechery @Lefsetz letter - @gasca @stratechery Bridgewater’s Daily Observations - Yup!!!! ❤️❤️❤️❤️ #davechappelle https://t.co/ybSGNrQpYF - @ndimichino Sometimes - Setting up at @CampFlogGnaw https://t.co/nVq8QjkKsf - -.. code:: python - - rule = gen_rule_payload("from:jack", - from_date="2017-09-20", - to_date="2017-10-30", - count_bucket="day", - results_per_call=500) - print(rule) - -:: - - {"query":"from:jack","toDate":"201710300000","fromDate":"201709200000","bucket":"day"} - -.. code:: python - - counts = collect_results(rule, max_results=500, result_stream_args=enterprise_search_args) - -.. code:: python - - [print(c) for c in counts]; - -:: - - {'timePeriod': '201710290000', 'count': 0} - {'timePeriod': '201710280000', 'count': 0} - {'timePeriod': '201710270000', 'count': 3} - {'timePeriod': '201710260000', 'count': 6} - {'timePeriod': '201710250000', 'count': 4} - {'timePeriod': '201710240000', 'count': 4} - {'timePeriod': '201710230000', 'count': 0} - {'timePeriod': '201710220000', 'count': 0} - {'timePeriod': '201710210000', 'count': 3} - {'timePeriod': '201710200000', 'count': 2} - {'timePeriod': '201710190000', 'count': 1} - {'timePeriod': '201710180000', 'count': 6} - {'timePeriod': '201710170000', 'count': 2} - {'timePeriod': '201710160000', 'count': 2} - {'timePeriod': '201710150000', 'count': 1} - {'timePeriod': '201710140000', 'count': 64} - {'timePeriod': '201710130000', 'count': 3} - {'timePeriod': '201710120000', 'count': 4} - {'timePeriod': '201710110000', 'count': 8} - {'timePeriod': '201710100000', 'count': 4} - {'timePeriod': '201710090000', 'count': 1} - {'timePeriod': '201710080000', 'count': 0} - {'timePeriod': '201710070000', 'count': 0} - {'timePeriod': '201710060000', 'count': 1} - {'timePeriod': '201710050000', 'count': 3} - {'timePeriod': '201710040000', 'count': 5} - {'timePeriod': '201710030000', 'count': 8} - {'timePeriod': '201710020000', 'count': 5} - {'timePeriod': '201710010000', 'count': 0} - {'timePeriod': '201709300000', 'count': 0} - {'timePeriod': '201709290000', 'count': 0} - {'timePeriod': '201709280000', 'count': 9} - {'timePeriod': '201709270000', 'count': 41} - {'timePeriod': '201709260000', 'count': 13} - {'timePeriod': '201709250000', 'count': 6} - {'timePeriod': '201709240000', 'count': 7} - {'timePeriod': '201709230000', 'count': 3} - {'timePeriod': '201709220000', 'count': 0} - {'timePeriod': '201709210000', 'count': 1} - {'timePeriod': '201709200000', 'count': 7} - -Contributing -============ - -Any contributions should follow the following pattern: - -1. Make a feature or bugfix branch, e.g., - ``git checkout -b my_new_feature`` -2. Make your changes in that branch -3. Ensure you bump the version number in ``searchtweets/_version.py`` to - reflect your changes. We use `Semantic - Versioning `__, so non-breaking enhancements - should increment the minor version, e.g., ``1.5.0 -> 1.6.0``, and - bugfixes will increment the last version, ``1.6.0 -> 1.6.1``. -4. 
Create a pull request - -After the pull request process is accepted, package maintainers will -handle building documentation and distribution to Pypi. - -For reference, distributing to Pypi is accomplished by the following -commands, ran from the root directory in the repo: - -.. code:: bash - - python setup.py bdist_wheel - python setup.py sdist - twine upload dist/* - -How to build the documentation: - -Building the documentation requires a few Sphinx packages to build the -webpages: - -.. code:: bash - - pip install sphinx - pip install sphinx_bootstrap_theme - pip install sphinxcontrib-napoleon - -Then (once your changes are committed to master) you should be able to -run the documentation-generating bash script and follow the -instructions: - -.. code:: bash - - bash build_sphinx_docs.sh master searchtweets +$ search_tweets.py -h -Note that this README is also generated, and so after any README changes -you'll need to re-build the README (you need pandoc version 2.1+ for -this) and commit the result: +usage: search_tweets.py [-h] [--credential-file CREDENTIAL_FILE] + [--credential-file-key CREDENTIAL_YAML_KEY] -.. code:: bash - bash make_readme.sh From 9597c8339654660fd8c5fc44ea54972648bb0ad8 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 12 May 2020 20:29:32 -0600 Subject: [PATCH 06/83] Update README.rst --- README.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index df86d0b..4f65a51 100644 --- a/README.rst +++ b/README.rst @@ -14,19 +14,19 @@ Features ======== - Supports Labs Recent search, v2. -- Supports a new "polling" mode using the new Labs ```since_id``` search request parameter. The ```since_id```, along with the new ```until_id``` provide a way to navigate the public Tweet archive by Tweet ID. -- Supports additional ways to specify ```start_time``` and ```end_time``` request parameters: +- Supports a new "polling" mode using the new Labs ```since-id``` search request parameter. The ```since-id```, along with the new ```until-id``` provide a way to navigate the public Tweet archive by Tweet ID. +- Supports additional ways to specify ```start-time``` and ```end-time``` request parameters: - - d# - For example, 'd2' sets ```start_time``` to (exactly) two days ago. - - h# - For example, 'h12' sets ```start_time``` to (exactly) twelve hours ago. - - m# - For example, 'm15' sets ```start_time``` to (exactly) fifteen minutes ago. + - d# - For example, 'd2' sets ```start-time``` to (exactly) two days ago. + - h# - For example, 'h12' sets ```start-time``` to (exactly) twelve hours ago. + - m# - For example, 'm15' sets ```start-time``` to (exactly) fifteen minutes ago. - These are handy for kicking off searches with a backfill period, and also work with the ```end_time``` request parameter. + These are handy for kicking off searches with a backfill period, and also work with the ```end-time``` request parameter. + +These features were inherited from the enterprise/premium version: -These features were inherited from the enterprise/premium version. - Command-line utility is pipeable to other tools (e.g., ``jq``). -- Automatically handles pagination of search results with specifiable - limits. +- Automatically handles pagination of search results with specifiable limits. - Delivers a stream of data to the user for low in-memory requirements. - Handles OAuth 2 and Bearer Token authentication. - Flexible usage within a python program. 
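
Taken together, the ``since_id`` plumbing and the relative ``start_time``/``end_time`` shorthand added in these patches make a simple ID-based polling loop possible. A minimal sketch follows (illustrative only: the ``max_tweets`` argument name, dict-style Tweet payloads, newest-first result ordering, and the package-level imports are assumptions about this branch, not confirmed behavior):

.. code:: python

    # Illustrative polling loop built on the Labs `since_id` parameter:
    # each pass asks only for Tweets newer than the newest one already seen.
    import time
    from searchtweets import (gen_request_parameters, collect_results,
                              load_credentials)

    search_args = load_credentials()  # reads ~/.twitter_keys.yaml by default

    newest_id = None
    while True:
        query = gen_request_parameters("(snow OR rain) has:media -is:retweet",
                                       results_per_call=100,
                                       since_id=newest_id)
        tweets = collect_results(query, max_tweets=100,
                                 result_stream_args=search_args)
        if tweets:
            # Recent search returns results newest-first, so the first Tweet
            # carries the highest ID seen in this pass.
            newest_id = tweets[0]["id"]
        time.sleep(60)  # wait a minute before polling again

A first request could also seed a backfill window with the relative shorthand, e.g. ``start_time="h12"``, before the loop switches over to ``since_id``.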
From 3030c351b2d467c96b8b4f5020d9433385b8404d Mon Sep 17 00:00:00 2001
From: "@snowman"
Date: Tue, 12 May 2020 22:08:53 -0600
Subject: [PATCH 07/83] Update README.rst

---
 README.rst | 87 +++++++++++++++++++-----------------------------------
 1 file changed, 30 insertions(+), 57 deletions(-)

diff --git a/README.rst b/README.rst
index 4f65a51..e695377 100644
--- a/README.rst
+++ b/README.rst
@@ -1,27 +1,24 @@
 Python client for Labs Recent search
 ====================================
 
-This project serves as a wrapper for the `Twitter Labs recent search
-APIs `__,
-providing a command-line utility and a Python library.
+Welcome to the ``labs`` branch of the Python search client. This branch was born from the ``master`` branch that supports premium and enterprise tiers of Twitter search. This branch supports the `Twitter Developer Labs Recent search v2 endpoint `__ only, and drops support for the premium and enterprise tiers.
 
-This is a fork of the premium/enterprise search client at https://github.com/twitterdev/search-tweets-python.
-
-If you are working with an enterprise or premium 30-day or Full-archive search endpoint, the ```master``` branch of this repository has what you need.
+Note: If you are looking for the original version that works with premium and enterprise versions of search, head on over to the ``master`` branch.
 
+This project serves as a wrapper for this endpoint, providing a command-line utility and a Python library.
 
 Features
 ========
 
 - Supports Labs Recent search, v2.
-- Supports a new "polling" mode using the new Labs ```since-id``` search request parameter. The ```since-id```, along with the new ```until-id``` provide a way to navigate the public Tweet archive by Tweet ID.
-- Supports additional ways to specify ```start-time``` and ```end-time``` request parameters:
+- Supports a new "polling" mode using the new Labs ``since-id`` search request parameter. The ``since-id``, along with the new ``until-id``, provides a way to navigate the public Tweet archive by Tweet ID.
+- Supports additional ways to specify ``start-time`` and ``end-time`` request parameters:
 
-    - d# - For example, 'd2' sets ``start-time`` to (exactly) two days ago.
-    - h# - For example, 'h12' sets ``start-time`` to (exactly) twelve hours ago.
-    - m# - For example, 'm15' sets ``start-time`` to (exactly) fifteen minutes ago.
+    - d# - For example, 'd2' sets ``start-time`` to (exactly) two days ago.
+    - h# - For example, 'h12' sets ``start-time`` to (exactly) twelve hours ago.
+    - m# - For example, 'm15' sets ``start-time`` to (exactly) fifteen minutes ago.
 
     These are handy for kicking off searches with a backfill period, and also work with the ``end-time`` request parameter.
 
 These features were inherited from the enterprise/premium version:
 
@@ -38,7 +35,7 @@ Labs updates
 When migrating this Python search client from an enterprise or premium search endpoint, the following updates were made:
 
 - Added support for GET requests (and removed POST support for now).
-- Added support for ``since_id`` and ``until_id`` request parameters.
- Updated pagination details.
- Updated app command-line parlance
  - --start-datetime → --start-time
@@ -110,51 +107,27 @@ optional arguments:
   --debug               print all info and warning messages
 
 
-Migrating from enterprise/premium library
-=========================================
-
-
-
-
-
-
-
-
-
 Installation
 =============
 
-{Are there any new conventions?}
-Maintaing two packages:
-+ searchtweets (current enterprise/premium package)
-+ searchtweetslabs
-Eventually, there will be searchtweetsv2, and searchtweets will be dropped.
-
-The searchtweets library is on Pypi:
+Currently, there is not an updated Pypi install package for the Labs version. To get started with this code, you'll need to clone the repository, install the required Python packages, set up your credentials, and start making requests.
 
-pip install searchtweets
-Or you can install the development version locally via
+To confirm that your code is ready to go, run the ``$ python3 scripts/search_tweets.py -h`` command. You should see the help details shown above.
 
-git clone https://github.com/twitterdev/search-tweets-python
-cd search-tweets-python
-pip install -e .
 Credential Handling
-The premium and enterprise Search APIs use different authentication methods and we attempt to provide a seamless way to handle authentication for all customers. We know credentials can be tricky or annoying - please read this in its entirety.
-
-Premium clients will require the bearer_token and endpoint fields; Enterprise clients require username, password, and endpoint. If you do not specify the account_type, we attempt to discern the account type and declare a warning about this behavior.
+===================
 
-For premium search products, we are using app-only authentication and the bearer tokens are not delivered with an expiration time. You can provide either: - your application key and secret (the library will handle bearer-token authentication) - a bearer token that you get yourself
+The Labs Recent search endpoint uses app-only authentication. You can configure either your application consumer key and secret, or a Bearer Token you have generated. If you supply the application key and secret, the client will generate a Bearer Token for you.
 
-Many developers might find providing your application key and secret more straightforward and letting this library manage your bearer token generation for you. Please see here for an overview of the premium authentication method.
+Many developers might find it more straightforward to provide your application key and secret and let this library manage Bearer Token generation for you. Please see `HERE `_ for an overview of the app-only authentication method.
 
 We support both YAML-file based methods and environment variables for storing credentials, and provide flexible handling with sensible defaults.
 
 YAML method
-For premium customers, the simplest credential file should look like this:
+The simplest credential file should look like this:
 
-search_tweets_endpoint:
-  endpoint:
+search_tweets_api:
+  endpoint: https://api.twitter.com/labs/2/tweets/search
   consumer_key:
   consumer_secret:
 
 By default, this library expects this file at "~/.twitter_keys.yaml", but you can pass the relevant location as needed, either with the --credential-file flag for the command-line app or as demonstrated below in a Python program.
 
 Both above examples require no special command-line arguments or in-program arguments. The credential parsing methods, unless otherwise specified, will look for a YAML key called search_tweets_api.
 
 For developers who have multiple endpoints and/or search products, you can keep all credentials in the same file and specify specific keys to use. --credential-file-key specifies this behavior in the command line app.
An example: search_tweets_labsv1: - endpoint: + endpoint: https://api.twitter.com/labs/1/tweets/search consumer_key: consumer_secret: (optional) bearer_token: search_tweets_labsv2: - endpoint: + endpoint: https://api.twitter.com/labs/2/tweets/search consumer_key: consumer_secret: (optional) bearer_token: @@ -179,7 +152,7 @@ search_tweets_labsv2: Environment Variables -If you want or need to pass credentials via environment variables, you can set the appropriate variables for your product of the following: +If you want or need to pass credentials via environment variables, you can set the appropriate variables: export SEARCHTWEETS_ENDPOINT= export SEARCHTWEETS_BEARER_TOKEN= @@ -231,14 +204,14 @@ the flags: are used to control credential behavior from the command-line app. Using the Comand Line Application -The library includes an application, search_tweets.py, that provides rapid access to Tweets. When you use pip to install this package, search_tweets.py is installed globally. The file is located in the tools/ directory for those who want to run it locally. +The library includes an application, search_tweets.py, that provides rapid access to Tweets. When you use pip to install this package, search_tweets.py is installed globally. The file is located in the scripts/ directory for those who want to run it locally. -Note that the --results-per-call flag specifies an argument to the API ( maxResults, results returned per CALL), not as a hard max to number of results returned from this program. The argument --max-results defines the maximum number of results to return from a given call. All examples assume that your credentials are set up correctly in the default location - .twitter_keys.yaml or in environment variables. +Note that the --results-per-call flag specifies an argument to the API, not as a hard max to number of results returned from this program. The argument --max-tweets defines the maximum number of results to return from a single run of the ``search-tweets.py``` script. All examples assume that your credentials are set up correctly in the default location - .twitter_keys.yaml or in environment variables. Stream json results to stdout without saving search_tweets.py \ - --max-results 1000 \ + --max-tweets 1000 \ --results-per-call 100 \ --query "(snow OR rain) has:media -is:retweet" \ --print-stream @@ -256,14 +229,14 @@ search_tweets.py \ --max-results 100 \ --results-per-call 100 \ --query "(snow OR rain) has:media -is:retweet" \ - --filename-prefix beyonce_geo \ + --filename-prefix weather_pic \ --no-print-stream One or more custom headers can be specified from the command line, using the --extra-headers argument and a JSON-formatted string representing a dictionary of extra headers: search_tweets.py \ --query "(snow OR rain) has:media -is:retweet" \ --extra-headers '{"":""}' -Options can be passed via a configuration file (either ini or YAML). Example files can be found in the tools/api_config_example.config or ./tools/api_yaml_example.yaml files, which might look like this: +Options can be passed via a configuration file (either ini or YAML). 
Example files can be found in the config/api_config_example.config or config/api_yaml_example.yaml files, which might look like this: [search_rules] start_time = 2020-05-01 @@ -282,13 +255,13 @@ results_per_file = 10000000 Or this: search_rules: - start_time: 2017-06-01 - end_time: 2017-09-01 01:01 + start_time: 2020-05-01 + end_time: 2020-05-01 01:01 query: (snow OR rain) has:media -is:retweet search_params: - results-per-call: 100 - max-results: 500 + results_per_call: 100 + max_results: 500 output_params: save_file: True From 963a2848a9f80035ff90c0b52c9b4e3b384c6d79 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 12 May 2020 22:28:03 -0600 Subject: [PATCH 08/83] Update README.rst --- README.rst | 272 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 156 insertions(+), 116 deletions(-) diff --git a/README.rst b/README.rst index e695377..f63e25e 100644 --- a/README.rst +++ b/README.rst @@ -124,12 +124,16 @@ Many developers might find providing your application key and secret more straig We support both YAML-file based methods and environment variables for storing credentials, and provide flexible handling with sensible defaults. YAML method +=========== + The simplest credential file should look like this: -search_tweets_api: - endpoint: https://api.twitter.com/labs/2/tweets/search - consumer_key: - consumer_secret: +.. code:: yaml + + search_tweets_api: + endpoint: https://api.twitter.com/labs/2/tweets/search + consumer_key: + consumer_secret: By default, this library expects this file at "~/.twitter_keys.yaml", but you can pass the relevant location as needed, either with the --credential-file flag for the command-line app or as demonstrated below in a Python program. @@ -137,156 +141,192 @@ Both above examples require no special command-line arguments or in-program argu For developers who have multiple endpoints and/or search products, you can keep all credentials in the same file and specify specific keys to use. --credential-file-key specifies this behavior in the command line app. An example: -search_tweets_labsv1: - endpoint: https://api.twitter.com/labs/1/tweets/search - consumer_key: - consumer_secret: - (optional) bearer_token: +.. code:: yaml -search_tweets_labsv2: - endpoint: https://api.twitter.com/labs/2/tweets/search - consumer_key: - consumer_secret: - (optional) bearer_token: + search_tweets_labsv1: + endpoint: https://api.twitter.com/labs/1/tweets/search + consumer_key: + consumer_secret: + (optional) bearer_token: + search_tweets_labsv2: + endpoint: https://api.twitter.com/labs/2/tweets/search + consumer_key: + consumer_secret: + (optional) bearer_token: Environment Variables +===================== If you want or need to pass credentials via environment variables, you can set the appropriate variables: -export SEARCHTWEETS_ENDPOINT= -export SEARCHTWEETS_BEARER_TOKEN= -export SEARCHTWEETS_CONSUMER_KEY= -export SEARCHTWEETS_CONSUMER_SECRET= +:: -The load_credentials function will attempt to find these variables if it cannot load fields from the YAML file, and it will overwrite any credentials from the YAML file that are present as environment variables if they have been parsed. This behavior can be changed by setting the load_credentials parameter env_overwrite to False. 
+
+   export SEARCHTWEETS_ENDPOINT=
+   export SEARCHTWEETS_BEARER_TOKEN=
+   export SEARCHTWEETS_CONSUMER_KEY=
+   export SEARCHTWEETS_CONSUMER_SECRET=
+
+The ``load_credentials`` function will attempt to find these variables if it cannot load fields from the YAML file, and it will **overwrite any credentials from the YAML file that are present as environment variables** if they have been parsed. This behavior can be changed by setting the ``load_credentials`` parameter ``env_overwrite`` to ``False``.
 
 The following cells demonstrate credential handling in the Python library.
 
-from searchtweets import load_credentials
-load_credentials(filename="./search_tweets_creds_example.yaml",
-                 yaml_key="search_tweets_ent_example",
-                 env_overwrite=False)
-{ 'endpoint': ''}
-
-load_credentials(filename="./search_tweets_creds_example.yaml",
-                 yaml_key="search_tweetsv2_example",
-                 env_overwrite=False)
-
-{'bearer_token': '',
- 'endpoint': 'https://api.twitter.com/labs/2/tweets/search',
- 'extra_headers_dict': None}
-
+.. code:: python
+
+   from searchtweets import load_credentials
+
+.. code:: python
+
+   load_credentials(filename="./search_tweets_creds_example.yaml",
+                    yaml_key="search_tweets_v2_example",
+                    env_overwrite=False)
+
+::
+
+   {'bearer_token': '',
+    'endpoint': 'https://api.twitter.com/labs/2/tweets/search',
+    'extra_headers_dict': None}
 
 Environment Variable Overrides
+==============================
 
 If we set our environment variables, the program will look for them regardless of a YAML file's validity or existence.
 
-import os
-os.environ["SEARCHTWEETS_USERNAME"] = ""
-os.environ["SEARCHTWEETS_BEARERTOKEN"] = ""
-os.environ["SEARCHTWEETS_ENDPOINT"] = ""
+.. code:: python
+
+   import os
+   os.environ["SEARCHTWEETS_BEARER_TOKEN"] = ""
+   os.environ["SEARCHTWEETS_ENDPOINT"] = ""
+
+   load_credentials(filename="nothing_here.yaml", yaml_key="no_key_here")
+
+::
+
+   cannot read file nothing_here.yaml
+   Error parsing YAML file; searching for valid environment variables
 
-load_credentials(filename="nothing_here.yaml", yaml_key="no_key_here")
-cannot read file nothing_here.yaml
+::
 
-Error parsing YAML file; searching for valid environment variables
-{'bearer_token': '',
- 'endpoint': ''}
+   {'bearer_token': '',
+    'endpoint': ''}
 
 Command-line app
+----------------
 
 the flags:
 
---credential-file
---credential-file-key
---env-overwrite
+- ``--credential-file ``
+- ``--credential-file-key ``
+- ``--env-overwrite``
 
 are used to control credential behavior from the command-line app.
 
+----------------
+
 Using the Command Line Application
+==================================
 
-The library includes an application, search_tweets.py, that provides rapid access to Tweets. When you use pip to install this package, search_tweets.py is installed globally. The file is located in the scripts/ directory for those who want to run it locally.
-
-Note that the --results-per-call flag specifies an argument to the API, not as a hard max to number of results returned from this program. The argument --max-tweets defines the maximum number of results to return from a single run of the ``search-tweets.py``` script. All examples assume that your credentials are set up correctly in the default location - .twitter_keys.yaml or in environment variables.
+The library includes an application, ``search_tweets.py``, that provides rapid access to Tweets. When you use ``pip`` to install this package, ``search_tweets.py`` is installed globally. The file is located in the ``scripts/`` directory for those who want to run it locally.
+
+Note that the ``--results-per-call`` flag specifies an argument to the API, not a hard max on the number of results returned from this program. The argument ``--max-tweets`` defines the maximum number of results to return from a single run of the ``search_tweets.py`` script (e.g., ``--max-tweets 1000`` with ``--results-per-call 100`` implies roughly ten paginated requests). All examples assume that your credentials are set up correctly in the default location - ``.twitter_keys.yaml`` or in environment variables.
- -Stream json results to stdout without saving - -search_tweets.py \ - --max-tweets 1000 \ - --results-per-call 100 \ - --query "(snow OR rain) has:media -is:retweet" \ - --print-stream -Stream json results to stdout and save to a file - -search_tweets.py \ - --max-results 1000 \ - --results-per-call 100 \ - --query "(snow OR rain) has:media -is:retweet" \ - --filename-prefix beyonce_geo \ - --print-stream -Save to file without output - -search_tweets.py \ - --max-results 100 \ - --results-per-call 100 \ - --query "(snow OR rain) has:media -is:retweet" \ - --filename-prefix weather_pic \ - --no-print-stream -One or more custom headers can be specified from the command line, using the --extra-headers argument and a JSON-formatted string representing a dictionary of extra headers: - -search_tweets.py \ - --query "(snow OR rain) has:media -is:retweet" \ - --extra-headers '{"":""}' -Options can be passed via a configuration file (either ini or YAML). Example files can be found in the config/api_config_example.config or config/api_yaml_example.yaml files, which might look like this: - -[search_rules] -start_time = 2020-05-01 -end_time = 2020-05-01 -query = (snow OR rain) has:media -is:retweet - -[search_params] -results_per_call = 100 -max_tweets = 10000 - -[output_params] -save_file = True -filename_prefix = weather-pics -results_per_file = 10000000 +================================= + +The library includes an application, ``search_tweets.py``, that provides rapid access to Tweets. When you use ``pip`` to install this package, ``search_tweets.py`` is installed globally. The file is located in the ``scripts/`` directory for those who want to run it locally. + +Note that the ``--results-per-call`` flag specifies an argument to the API, not as a hard max to number of results returned from this program. The argument ``--max-tweets`` defines the maximum number of results to return from a single run of the ``search-tweets.py``` script. All examples assume that your credentials are set up correctly in the default location - ``.twitter_keys.yaml`` or in environment variables. + +**Stream json results to stdout without saving** + +.. code:: bash + + search_tweets.py \ + --max-tweets 1000 \ + --results-per-call 100 \ + --query "(snow OR rain) has:media -is:retweet" \ + --print-stream + +**Stream json results to stdout and save to a file** + +.. code:: bash + + search_tweets.py \ + --max-results 1000 \ + --results-per-call 100 \ + --query "(snow OR rain) has:media -is:retweet" \ + --filename-prefix beyonce_geo \ + --print-stream + +**Save to file without output** + +.. code:: bash + + search_tweets.py \ + --max-results 100 \ + --results-per-call 100 \ + --query "(snow OR rain) has:media -is:retweet" \ + --filename-prefix weather_pic \ + --no-print-stream + +One or more custom headers can be specified from the command line, using the ``--extra-headers`` argument and a JSON-formatted string representing a dictionary of extra headers: + +.. code:: bash + + search_tweets.py \ + --query "(snow OR rain) has:media -is:retweet" \ + --extra-headers '{"":""}' + +Options can be passed via a configuration file (either ini or YAML). Example files can be found in the ``config/api_config_example.config`` or ``config/api_yaml_example.yaml`` files, which might look like this: + +.. 
code:: bash + + [search_rules] + start_time = 2020-05-01 + end_time = 2020-05-01 + query = (snow OR rain) has:media -is:retweet + + [search_params] + results_per_call = 100 + max_tweets = 10000 + + [output_params] + save_file = True + filename_prefix = weather-pics + results_per_file = 10000000 Or this: -search_rules: - start_time: 2020-05-01 - end_time: 2020-05-01 01:01 - query: (snow OR rain) has:media -is:retweet +.. code:: bash + + search_rules: + start_time: 2020-05-01 + end_time: 2020-05-01 01:01 + query: (snow OR rain) has:media -is:retweet -search_params: - results_per_call: 100 - max_results: 500 + search_params: + results_per_call: 100 + max_results: 500 + + output_params: + save_file: True + filename_prefix: (snow OR rain) has:media -is:retweet + results_per_file: 10000000 -output_params: - save_file: True - filename_prefix: (snow OR rain) has:media -is:retweet - results_per_file: 10000000 Custom headers can be specified in a config file, under a specific credentials key: -search_tweets_api: - endpoint: - bearer_token: - extra_headers: - : -When using a config file in conjunction with the command-line utility, you need to specify your config file via the --config-file parameter. Additional command-line arguments will either be added to the config file args or overwrite the config file args if both are specified and present. +.. code:: yaml + + search_tweets_api: + endpoint: + bearer_token: + extra_headers: + : + +When using a config file in conjunction with the command-line utility, you need to specify your config file via the ``--config-file`` parameter. Additional command-line arguments will either be added to the config file args or overwrite the config file args if both are specified and present. Example: -search_tweets.py \ - --config-file myapiconfig.config \ - --no-print-stream -Full options are listed below: +:: -$ search_tweets.py -h + search_tweets.py \ + --config-file myapiconfig.config \ + --no-print-stream -usage: search_tweets.py [-h] [--credential-file CREDENTIAL_FILE] - [--credential-file-key CREDENTIAL_YAML_KEY] +------------------ + From a2dc6af39822aa05a322eaafd19ddbd934ed0b83 Mon Sep 17 00:00:00 2001 From: Andy Piper Date: Wed, 13 May 2020 16:31:17 +0100 Subject: [PATCH 09/83] Update README.rst Add Labs version badges --- README.rst | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/README.rst b/README.rst index f63e25e..dee9b97 100644 --- a/README.rst +++ b/README.rst @@ -1,7 +1,15 @@ Python client for Labs Recent search ==================================== -Welcome to the ``labs`` branch of the Python search client. This branch was born from the ``master`` branch that supports premium and enterprise tiers of Twitter search. This branch supports the `Twitter Developer Labs Recent search v2 endpoint `__ only, and drops support for the premium and enterprise tiers. +.. image:: https://img.shields.io/static/v1?label=Twitter%20API&message=Developer%20Labs%20v1&color=794BC4&style=flat&logo=Twitter + :target: https://developer.twitter.com/en/docs/labs/overview/versioning + :alt: Labs v1 + +.. image:: https://img.shields.io/static/v1?label=Twitter%20API&message=Developer%20Labs%20v2&color=794BC4&style=flat&logo=Twitter + :target: https://developer.twitter.com/en/docs/labs/overview/versioning + :alt: Labs v2 + +Welcome to the ``labs`` branch of the Python search client. 
This branch was born from the ``master`` branch that supports premium and enterprise tiers of Twitter search. This branch supports the `Twitter Developer Labs Recent search v2 endpoint `__ only, and drops support for the premium and enterprise tiers. Note: If you are looking for the original version that works with premium and enterprise versions of search, head on over to the ``master`` branch. @@ -10,15 +18,15 @@ This project serves as a wrapper for the, providing a command-line utility and a Features ======== -- Supports Labs Recent search, v2. -- Supports a new "polling" mode using the new Labs ``since-id`` search request parameter. The ``since-id``, along with the new ``until-id`` provide a way to navigate the public Tweet archive by Tweet ID. +- Supports Labs Recent search, v2. +- Supports a new "polling" mode using the new Labs ``since-id`` search request parameter. The ``since-id``, along with the new ``until-id`` provide a way to navigate the public Tweet archive by Tweet ID. - Supports additional ways to specify ``start-time`` and ``end-time`` request parameters: - - d# - For example, 'd2' sets ``start-time`` to (exactly) two days ago. - - h# - For example, 'h12' sets ``start-time`` to (exactly) twelve hours ago. - - m# - For example, 'm15' sets ``start-time`` to (exactly) fifteen minutes ago. - - These are handy for kicking off searches with a backfill period, and also work with the ``end-time`` request parameter. + - d# - For example, 'd2' sets ``start-time`` to (exactly) two days ago. + - h# - For example, 'h12' sets ``start-time`` to (exactly) twelve hours ago. + - m# - For example, 'm15' sets ``start-time`` to (exactly) fifteen minutes ago. + + These are handy for kicking off searches with a backfill period, and also work with the ``end-time`` request parameter. These features were inherited from the enterprise/premium version: @@ -44,12 +52,12 @@ When migrating this Python search client from an enterprise or premium search en - --max-results → --max-tweets - Dropped --account-type. - Dropped --count-bucket. Removed search 'counts' endpoint support. This endpoint is currently not available in Labs. - + Command-line options ===================== -usage: search_tweets.py +usage: search_tweets.py optional arguments: -h, --help show this help message and exit @@ -110,14 +118,14 @@ optional arguments: Installation ============= -Currently, there is not an updated Pypi install package for the Labs version. To get started with this code, you'll need to clone the repository, install the required Python packages, set up your credentials, and start making requests. +Currently, there is not an updated Pypi install package for the Labs version. To get started with this code, you'll need to clone the repository, install the required Python packages, set up your credentials, and start making requests. To confirm the your code is ready to go, run the ``$python3 scripts/search-tweets.py -h`` command. You should see the help details shown above. Credential Handling =================== -The Labs Recent search endpoint uses app-only authentication. You have the choice to configure your application consumer key and secret, or a Bearer Token you have generated. If you supply the application key and secret, the client will generate a Bearer Token for you. +The Labs Recent search endpoint uses app-only authentication. You have the choice to configure your application consumer key and secret, or a Bearer Token you have generated. 
If you supply the application key and secret, the client will generate a Bearer Token for you. Many developers might find providing your application key and secret more straightforward and letting this library manage your Bearer Token generation for you. Please see `HERE `_ for an overview of the app-only authentication method. @@ -180,13 +188,13 @@ The following cells demonstrates credential handling in the Python library. load_credentials(filename="./search_tweets_creds_example.yaml", yaml_key="search_tweets_v2_example", env_overwrite=False) - + :: - + {'bearer_token': '', 'endpoint': 'https://api.twitter.com/labs/2/tweets/search', 'extra_headers_dict': None} - + Environment Variable Overrides ============================== @@ -199,7 +207,7 @@ If we set our environment variables, the program will look for them regardless o os.environ["SEARCHTWEETS_ENDPOINT"] = "" load_credentials(filename="nothing_here.yaml", yaml_key="no_key_here") - + :: cannot read file nothing_here.yaml Error parsing YAML file; searching for valid environment variables @@ -238,7 +246,7 @@ Note that the ``--results-per-call`` flag specifies an argument to the API, not --results-per-call 100 \ --query "(snow OR rain) has:media -is:retweet" \ --print-stream - + **Stream json results to stdout and save to a file** .. code:: bash @@ -249,7 +257,7 @@ Note that the ``--results-per-call`` flag specifies an argument to the API, not --query "(snow OR rain) has:media -is:retweet" \ --filename-prefix beyonce_geo \ --print-stream - + **Save to file without output** .. code:: bash @@ -260,7 +268,7 @@ Note that the ``--results-per-call`` flag specifies an argument to the API, not --query "(snow OR rain) has:media -is:retweet" \ --filename-prefix weather_pic \ --no-print-stream - + One or more custom headers can be specified from the command line, using the ``--extra-headers`` argument and a JSON-formatted string representing a dictionary of extra headers: .. code:: bash @@ -268,7 +276,7 @@ One or more custom headers can be specified from the command line, using the ``- search_tweets.py \ --query "(snow OR rain) has:media -is:retweet" \ --extra-headers '{"":""}' - + Options can be passed via a configuration file (either ini or YAML). Example files can be found in the ``config/api_config_example.config`` or ``config/api_yaml_example.yaml`` files, which might look like this: .. code:: bash @@ -326,7 +334,7 @@ Example: --no-print-stream ------------------ - + From c427f0e6c8d79482e491fbb22516baafa443fc68 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Wed, 13 May 2020 09:38:02 -0600 Subject: [PATCH 10/83] Update README.rst --- README.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.rst b/README.rst index dee9b97..07237b9 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,6 @@ Python client for Labs Recent search ==================================== -.. image:: https://img.shields.io/static/v1?label=Twitter%20API&message=Developer%20Labs%20v1&color=794BC4&style=flat&logo=Twitter - :target: https://developer.twitter.com/en/docs/labs/overview/versioning - :alt: Labs v1 - .. 
image:: https://img.shields.io/static/v1?label=Twitter%20API&message=Developer%20Labs%20v2&color=794BC4&style=flat&logo=Twitter :target: https://developer.twitter.com/en/docs/labs/overview/versioning :alt: Labs v2 From 23721b39766fad6b6dd77c3f49129caa56960dbf Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Wed, 13 May 2020 09:40:05 -0600 Subject: [PATCH 11/83] Update README.rst --- README.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 07237b9..20ebf30 100644 --- a/README.rst +++ b/README.rst @@ -1,15 +1,15 @@ -Python client for Labs Recent search -==================================== - .. image:: https://img.shields.io/static/v1?label=Twitter%20API&message=Developer%20Labs%20v2&color=794BC4&style=flat&logo=Twitter :target: https://developer.twitter.com/en/docs/labs/overview/versioning :alt: Labs v2 +Python client for Labs Recent search +==================================== + Welcome to the ``labs`` branch of the Python search client. This branch was born from the ``master`` branch that supports premium and enterprise tiers of Twitter search. This branch supports the `Twitter Developer Labs Recent search v2 endpoint `__ only, and drops support for the premium and enterprise tiers. Note: If you are looking for the original version that works with premium and enterprise versions of search, head on over to the ``master`` branch. -This project serves as a wrapper for the, providing a command-line utility and a Python library. +This project serves as a wrapper for the Labs Recent search endpoint, providing a command-line utility and a Python library. Features ======== From 0cba4d752fb4e2ecbf3ba9139503ef1c94235acf Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 19 May 2020 20:39:18 -0600 Subject: [PATCH 12/83] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bcf62d2..c0a434c 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ def parse_version(str_): if line.startswith("VERSION")][0].strip() VERSION = parse_version(_version_line) -setup(name='searchtweets', +setup(name='searchtweets-labs', description="Wrapper for Twitter Developer Labs Recent search endpoint.", url='https://github.com/twitterdev/search-tweets-python', author='Fiona Pigott, Jeff Kolb, Josh Montague, Aaron Gonzales, Jim Moffitt', From 8cf557adbaee120438b5acd3ef11abe611945efc Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 19 May 2020 20:47:45 -0600 Subject: [PATCH 13/83] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c0a434c..5d51359 100644 --- a/setup.py +++ b/setup.py @@ -33,5 +33,5 @@ def parse_version(str_): python_requires='>=3.3', install_requires=["requests", "tweet_parser", "pyyaml"], packages=find_packages(), - scripts=["tools/search_tweets.py","tools/polling_app.py"], + scripts=["scripts/search_tweets.py","scripts/polling_app.py"], ) From f9ae17dd0d7568db584218f31d2d65e0f3b5d953 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 19 May 2020 20:48:18 -0600 Subject: [PATCH 14/83] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5d51359..040d2ff 100644 --- a/setup.py +++ b/setup.py @@ -33,5 +33,5 @@ def parse_version(str_): python_requires='>=3.3', install_requires=["requests", "tweet_parser", "pyyaml"], 
packages=find_packages(),
-      scripts=["scripts/search_tweets.py","scripts/polling_app.py"],
+      scripts=["scripts/search_tweets.py","scripts/poll_app.py"],
       )

From fa80978d5b6dcb4f57f2afb86003edfed9107628 Mon Sep 17 00:00:00 2001
From: "@snowman"
Date: Tue, 19 May 2020 20:49:15 -0600
Subject: [PATCH 15/83] Update setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 040d2ff..cfd8c14 100644
--- a/setup.py
+++ b/setup.py
@@ -33,5 +33,5 @@ def parse_version(str_):
       python_requires='>=3.3',
       install_requires=["requests", "tweet_parser", "pyyaml"],
       packages=find_packages(),
-      scripts=["scripts/search_tweets.py","scripts/poll_app.py"],
+      scripts=["scripts/search_tweets.py","scripts/poll_tweets.py"],
       )

From fbe00aaa16556dc6f577f8630f88d17d3d50df27 Mon Sep 17 00:00:00 2001
From: "@snowman"
Date: Wed, 3 Jun 2020 14:02:20 -0600
Subject: [PATCH 16/83] Update README.rst

---
 README.rst | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index 20ebf30..ec48697 100644
--- a/README.rst
+++ b/README.rst
@@ -11,6 +11,8 @@ Note: If you are looking for the original version that works with premium and en

 This project serves as a wrapper for the Labs Recent search endpoint, providing a command-line utility and a Python library.

+To download and install this package, go to: https://pypi.org/project/searchtweets-labs/
+
 Features
 ========
@@ -46,7 +48,7 @@ When migrating this Python search client from an enterprise or premium search en
   - --end-datetime → --end-time
   - --filter-rule → --query
   - --max-results → --max-tweets
-  - Dropped --account-type.
+  - Dropped --account-type. No longer required since support for the Premium and Enterprise search tiers has been dropped.
   - Dropped --count-bucket. Removed search 'counts' endpoint support. This endpoint is currently not available in Labs.

@@ -114,7 +116,11 @@ optional arguments:
 Installation
 =============

-Currently, there is not an updated Pypi install package for the Labs version. To get started with this code, you'll need to clone the repository, install the required Python packages, set up your credentials, and start making requests.
+The updated Pypi install package for the Labs version is at:
+
+https://pypi.org/project/searchtweets-labs/
+
+Another option is to work directly with this code by cloning the repository, installing the required Python packages, setting up your credentials, and making requests.

 To confirm that your code is ready to go, run the ``python3 scripts/search_tweets.py -h`` command. You should see the help details shown above.

From 0c08b93a812ee1fbda0fd30e787d5559e720c9b6 Mon Sep 17 00:00:00 2001
From: Jim Moffitt
Date: Mon, 8 Jun 2020 17:48:44 -0600
Subject: [PATCH 17/83] JM: fixed a bug with handling YYYY-MM-DD HH:mm dates.

---
 searchtweets/api_utils.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/searchtweets/api_utils.py b/searchtweets/api_utils.py
index b452b25..bb7d64f 100644
--- a/searchtweets/api_utils.py
+++ b/searchtweets/api_utils.py
@@ -4,7 +4,7 @@
 # https://opensource.org/licenses/MIT
 """
 Module containing the various functions that are used for API calls,
-rule generation, and related.
+request payload generation, and related.
""" import re @@ -32,15 +32,15 @@ def convert_utc_time(datetime_str): - YYYY-mm-DD - YYYY-mm-DD HH:MM - YYYY-mm-DDTHH:MM - #Coming soon: - - 3d - -12h + - 3d (set start_time to three days ago) + - 12h (set start_time to twelve hours ago) + - 15m (set start_time to fifteen minutes ago) Args: datetime_str (str): valid formats are listed above. Returns: - string of GNIP API formatted date. + string of ISO formatted date. Example: >>> from searchtweets.utils import convert_utc_time @@ -73,6 +73,9 @@ def convert_utc_time(datetime_str): # command line with 'T' datetime_str = datetime_str.replace('T', ' ') _date = datetime.datetime.strptime(datetime_str, "%Y-%m-%d %H:%M") + else: + _date = datetime.datetime.strptime(datetime_str, "%Y-%m-%d %H:%M") + except ValueError: _date = datetime.datetime.strptime(datetime_str, "%Y-%m-%d") From 03fea84aac72aa338edaab5631ee6d353c718d70 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 9 Jun 2020 22:40:49 -0600 Subject: [PATCH 18/83] Update README.rst --- README.rst | 220 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 205 insertions(+), 15 deletions(-) diff --git a/README.rst b/README.rst index ec48697..8531ec5 100644 --- a/README.rst +++ b/README.rst @@ -51,6 +51,10 @@ When migrating this Python search client from an enterprise or premium search en - Dropped --account-type. No longer required since support for Premium and Enterprise search tiers have been dropped. - Dropped --count-bucket. Removed search 'counts' endpoint support. This endpoint is currently not available in Labs. +In this spirit of updating the parlance used, note that the a core method provided by searchtweets/result_stream.py has been renamed. The method gen_rule_payload has been updated to gen_request_parameters. + +Finally, the original version of search-tweets-python used a `Tweet Parser `__ to help manage the differences between two different JSON formats ("original" and "Activity Stream"). With Labs, there is just one version of Tweet JSON, so this Tweet Parser is not used. In the original code, this Tweet parser was envoked with a 'tweetify=True' directive. With this Labs version, this use of the Tweet Parser is turned off by instead using 'tweetify=False'. + Command-line options ===================== @@ -198,39 +202,41 @@ The following cells demonstrates credential handling in the Python library. 'extra_headers_dict': None} Environment Variable Overrides -============================== +------------------------------ -If we set our environment variables, the program will look for them regardless of a YAML file's validity or existence. +If we set our environment variables, the program will look for them +regardless of a YAML file's validity or existence. .. 
.. code:: python

-   import os
-   os.environ["SEARCHTWEETS_BEARER_TOKEN"] = ""
-   os.environ["SEARCHTWEETS_ENDPOINT"] = ""
-   load_credentials(filename="nothing_here.yaml", yaml_key="no_key_here")
+
+    import os
+    os.environ["SEARCHTWEETS_BEARER_TOKEN"] = ""
+    os.environ["SEARCHTWEETS_ENDPOINT"] = ""
+
+    load_credentials(filename="nothing_here.yaml", yaml_key="no_key_here")

 ::
+
-   cannot read file nothing_here.yaml
-   Error parsing YAML file; searching for valid environment variables
+    cannot read file nothing_here.yaml
+    Error parsing YAML file; searching for valid environment variables

 ::

-   {'bearer_token': '',
-   'endpoint': ''}
+    {'bearer_token': '',
+     'endpoint': ''}

 Command-line app
 ----------------

 the flags:

 - ``--credential-file ``
 - ``--credential-file-key ``
 - ``--env-overwrite``

 are used to control credential behavior from the command-line app.

------------------
+--------------

 Using the Command Line Application
 ==================================
@@ -337,6 +343,190 @@ Example:

 ------------------
+
+Using the Twitter Search APIs' Python Wrapper
+=============================================
+
+Working with the API within a Python program is straightforward.
+
+We'll assume that credentials are in the default location,
+``~/.twitter_keys.yaml``.
+
+.. code:: python
+
+    from searchtweets import ResultStream, gen_request_parameters, load_credentials
+
+
+Labs Setup
+-------------
+
+.. code:: python
+
+    labs_search_args = load_credentials("~/.twitter_keys.yaml",
+                                        yaml_key="search_tweets_labs",
+                                        env_overwrite=False)
+
+
+There is a function that formats search API queries into valid JSON request parameters, called ``gen_request_parameters``. It has sensible defaults, such as pulling more Tweets per call than the default 10, and not including dates. Discussing the finer points of
+generating search queries is out of scope for these examples; we encourage you to see the docs to learn the nuances within, but for now let's see what a query looks like.
+
+.. code:: python
+
+    query = gen_request_parameters("snow", results_per_call=100)
+    print(query)
+
+::
+
+    {"query":"snow","max_results":100}
+
+This query will match tweets that have the text ``snow`` in them.
+
+From this point, there are two ways to interact with the API. There is a quick method to collect smaller amounts of Tweets to memory that requires less thought and knowledge, and interaction with the ``ResultStream`` object which will be introduced later.
+
+Fast Way
+--------
+
+We'll use the ``search_args`` variable to power the configuration point for the API. The object also takes a valid search query and has options to cut off the search when hitting limits on both the number of Tweets and endpoint calls.
+
+We'll be using the ``collect_results`` function, which has three parameters.
+
+- query: a valid search query, referenced earlier
+- max_results: as the API handles pagination, it will stop collecting
+  when we get to this number
+- result_stream_args: configuration args that we've already specified.
+
+Let's see how it goes:
+
+.. code:: python
+
+    from searchtweets import collect_results
+
+.. code:: python
+
+    tweets = collect_results(query,
+                             max_results=100,
+                             result_stream_args=labs_search_args) # change this if you need to
+
+An overwhelming number of Tweet attributes are made available directly, as such:
+
+.. code:: python
+
+    [print(tweet.text, end='\n\n') for tweet in tweets[0:10]];
+
+::
+    "@CleoLoughlin Rain after the snow?
Do you have ice now?"} + "@koofltxr Rain, 134340, still with you, winter bear, Seoul, crystal snow, sea, outro:blueside"} + "@TheWxMeister Sorry it ruined your camping. I was covering plants in case we got snow in the Mountain Shadows area. Thankfully we didn\u2019t. At least it didn\u2019t stick to the ground. The wind was crazy! Got just over an inch of rain. Looking forward to better weather."} + "@brettlorenzen And, the reliability of \u201cNeither snow nor rain nor heat nor gloom of night stays these couriers (the #USPS) from the swift completion of their appointed rounds.\u201d"} + "\"Because black people get killed in the rain, black lives matter in the rain. It matters all the time. Snow, rain, sleet, sunny days. We're not out here because it's sunny. We're not out here for fun. We're out here because black lives matter.\" @wisn12news https://t.co/3kZZ7q2MR9"} + "Some of the master copies of the film \u201cGone With the Wind\u201d are archived at the @librarycongress near \u201cSnow White and the Seven Dwarfs\u201d and \u201cSingin\u2019 in the Rain.\u201d GWTW isn\u2019t going to vanish off the face of the earth."} + "Snow Man\u306eD.D.\u3068\nSixTONES\u306eImitation Rain\n\u6d41\u308c\u305f\u301c"} + "@Nonvieta Yup I work in the sanitation industry. I'm in the office however. Life would not go on without our garbage men and women out there. All day everyday rain snow or shine they out there."} + "This picture of a rainbow in WA proves nothing. How do we know if this rainbow was not on Mars or the ISS? Maybe it was drawn in on the picture. WA has mail-in voting so we do have to worry aboug rain, snow, poll workers not showing up or voting machines broke on election day !! https://t.co/5WdHx0acS0 https://t.co/BEKtTpBW9g"} + "Weather in Oslo at 06:00: Clear Temp: 10.6\u00b0C Min today: 9.1\u00b0C Rain today:0.0mm Snow now: 0.0cm Wind N Conditions: Clear Daylight:18:39 hours Sunset: 22:36"} + +Voila, we have some Tweets. For interactive environments and other cases where you don't care about collecting your data in a single load or don't need to operate on the stream of Tweets directly, I recommend using this convenience function. + +Working with the ResultStream +----------------------------- + +The ResultStream object will be powered by the ``search_args``, and takes the query and other configuration parameters, including a hard stop on number of pages to limit your API call usage. + +.. code:: python + + rs = ResultStream(query=query, + max_results=500, + max_pages=1, + **labs_search_args) + + print(rs) + + :: + + ResultStream: + { + "endpoint":"https:\/\/fanyv88.com:443\/https\/api.twitter.com\/labs\/2\/tweets\/search", + "request_parameters":{ + "query":"snow", + "max_results":100 + }, + "tweetify":false, + "max_results":1000 + } + +There is a function, ``.stream``, that seamlessly handles requests and pagination for a given query. It returns a generator, and to grab our 1000 Tweets that mention ``snow`` we can do this: +.. code:: python + + tweets = list(rs.stream()) + +.. code:: python + + # using unidecode to prevent emoji/accents printing + [print(tweet) for tweet in tweets[0:10]]; + +:: + +{"id": "1270572563505254404", "text": "@CleoLoughlin Rain after the snow? Do you have ice now?"} +{"id": "1270570767038599168", "text": "@koofltxr Rain, 134340, still with you, winter bear, Seoul, crystal snow, sea, outro:blueside"} +{"id": "1270570621282340864", "text": "@TheWxMeister Sorry it ruined your camping. 
I was covering plants in case we got snow in the Mountain Shadows area. Thankfully we didn\u2019t. At least it didn\u2019t stick to the ground. The wind was crazy! Got just over an inch of rain. Looking forward to better weather."} +{"id": "1270569070287630337", "text": "@brettlorenzen And, the reliability of \u201cNeither snow nor rain nor heat nor gloom of night stays these couriers (the #USPS) from the swift completion of their appointed rounds.\u201d"} +{"id": "1270568690447257601", "text": "\"Because black people get killed in the rain, black lives matter in the rain. It matters all the time. Snow, rain, sleet, sunny days. We're not out here because it's sunny. We're not out here for fun. We're out here because black lives matter.\" @wisn12news https://t.co/3kZZ7q2MR9"} +{"id": "1270568607605575680", "text": "Some of the master copies of the film \u201cGone With the Wind\u201d are archived at the @librarycongress near \u201cSnow White and the Seven Dwarfs\u201d and \u201cSingin\u2019 in the Rain.\u201d GWTW isn\u2019t going to vanish off the face of the earth."} +{"id": "1270568437916426240", "text": "Snow Man\u306eD.D.\u3068\nSixTONES\u306eImitation Rain\n\u6d41\u308c\u305f\u301c"} +{"id": "1270568195519373313", "text": "@Nonvieta Yup I work in the sanitation industry. I'm in the office however. Life would not go on without our garbage men and women out there. All day everyday rain snow or shine they out there."} +{"id": "1270567737283117058", "text": "This picture of a rainbow in WA proves nothing. How do we know if this rainbow was not on Mars or the ISS? Maybe it was drawn in on the picture. WA has mail-in voting so we do have to worry aboug rain, snow, poll workers not showing up or voting machines broke on election day !! https://t.co/5WdHx0acS0 https://t.co/BEKtTpBW9g"} +{"id": "1270566386524356608", "text": "Weather in Oslo at 06:00: Clear Temp: 10.6\u00b0C Min today: 9.1\u00b0C Rain today:0.0mm Snow now: 0.0cm Wind N Conditions: Clear Daylight:18:39 hours Sunset: 22:36"} + +Contributing +============ +Any contributions should follow the following pattern: + +1. Make a feature or bugfix branch, e.g., + ``git checkout -b my_new_feature`` +2. Make your changes in that branch +3. Ensure you bump the version number in ``searchtweets/_version.py`` to + reflect your changes. We use `Semantic + Versioning `__, so non-breaking enhancements + should increment the minor version, e.g., ``1.5.0 -> 1.6.0``, and + bugfixes will increment the last version, ``1.6.0 -> 1.6.1``. +4. Create a pull request + +After the pull request process is accepted, package maintainers will +handle building documentation and distribution to Pypi. + +For reference, distributing to Pypi is accomplished by the following +commands, ran from the root directory in the repo: + +.. code:: bash + + python setup.py bdist_wheel + python setup.py sdist + twine upload dist/* + +How to build the documentation: + +Building the documentation requires a few Sphinx packages to build the +webpages: + +.. code:: bash + + pip install sphinx + pip install sphinx_bootstrap_theme + pip install sphinxcontrib-napoleon + +Then (once your changes are committed to master) you should be able to +run the documentation-generating bash script and follow the +instructions: + +.. 
code:: bash + + bash build_sphinx_docs.sh master searchtweets + +Note that this README is also generated, and so after any README changes +you'll need to re-build the README (you need pandoc version 2.1+ for +this) and commit the result: + +.. code:: bash + bash make_readme.sh From 7798b0c1344a89e4610b6c72fb7d51967ed8a30c Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 9 Jun 2020 22:44:33 -0600 Subject: [PATCH 19/83] Update README.rst --- README.rst | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/README.rst b/README.rst index 8531ec5..9e6c2b3 100644 --- a/README.rst +++ b/README.rst @@ -413,16 +413,26 @@ An overwhelming number of Tweet attributes are made available directly, as such: [print(tweet.text, end='\n\n') for tweet in tweets[0:10]]; :: - "@CleoLoughlin Rain after the snow? Do you have ice now?"} - "@koofltxr Rain, 134340, still with you, winter bear, Seoul, crystal snow, sea, outro:blueside"} - "@TheWxMeister Sorry it ruined your camping. I was covering plants in case we got snow in the Mountain Shadows area. Thankfully we didn\u2019t. At least it didn\u2019t stick to the ground. The wind was crazy! Got just over an inch of rain. Looking forward to better weather."} - "@brettlorenzen And, the reliability of \u201cNeither snow nor rain nor heat nor gloom of night stays these couriers (the #USPS) from the swift completion of their appointed rounds.\u201d"} - "\"Because black people get killed in the rain, black lives matter in the rain. It matters all the time. Snow, rain, sleet, sunny days. We're not out here because it's sunny. We're not out here for fun. We're out here because black lives matter.\" @wisn12news https://t.co/3kZZ7q2MR9"} - "Some of the master copies of the film \u201cGone With the Wind\u201d are archived at the @librarycongress near \u201cSnow White and the Seven Dwarfs\u201d and \u201cSingin\u2019 in the Rain.\u201d GWTW isn\u2019t going to vanish off the face of the earth."} - "Snow Man\u306eD.D.\u3068\nSixTONES\u306eImitation Rain\n\u6d41\u308c\u305f\u301c"} - "@Nonvieta Yup I work in the sanitation industry. I'm in the office however. Life would not go on without our garbage men and women out there. All day everyday rain snow or shine they out there."} - "This picture of a rainbow in WA proves nothing. How do we know if this rainbow was not on Mars or the ISS? Maybe it was drawn in on the picture. WA has mail-in voting so we do have to worry aboug rain, snow, poll workers not showing up or voting machines broke on election day !! https://t.co/5WdHx0acS0 https://t.co/BEKtTpBW9g"} - "Weather in Oslo at 06:00: Clear Temp: 10.6\u00b0C Min today: 9.1\u00b0C Rain today:0.0mm Snow now: 0.0cm Wind N Conditions: Clear Daylight:18:39 hours Sunset: 22:36"} + + @CleoLoughlin Rain after the snow? Do you have ice now? + + @koofltxr Rain, 134340, still with you, winter bear, Seoul, crystal snow, sea, outro:blueside + + @TheWxMeister Sorry it ruined your camping. I was covering plants in case we got snow in the Mountain Shadows area. Thankfully we didn\u2019t. At least it didn\u2019t stick to the ground. The wind was crazy! Got just over an inch of rain. Looking forward to better weather. + + @brettlorenzen And, the reliability of \u201cNeither snow nor rain nor heat nor gloom of night stays these couriers (the #USPS) from the swift completion of their appointed rounds.\u201d + + Because black people get killed in the rain, black lives matter in the rain. 
It matters all the time. Snow, rain, sleet, sunny days. We're not out here because it's sunny. We're not out here for fun. We're out here because black lives matter.
+
+   Some of the master copies of the film \u201cGone With the Wind\u201d are archived at the @librarycongress near \u201cSnow White and the Seven Dwarfs\u201d and \u201cSingin\u2019 in the Rain.\u201d GWTW isn\u2019t going to vanish off the face of the earth.
+
+   Snow Man\u306eD.D.\u3068\nSixTONES\u306eImitation Rain\n\u6d41\u308c\u305f\u301c
+
+   @Nonvieta Yup I work in the sanitation industry. I'm in the office however. Life would not go on without our garbage men and women out there. All day everyday rain snow or shine they out there.
+
+   This picture of a rainbow in WA proves nothing. How do we know if this rainbow was not on Mars or the ISS? Maybe it was drawn in on the picture. WA has mail-in voting so we do have to worry aboug rain, snow, poll workers not showing up or voting machines broke on election day !! https://t.co/5WdHx0acS0 https://t.co/BEKtTpBW9g
+
+   Weather in Oslo at 06:00: Clear Temp: 10.6\u00b0C Min today: 9.1\u00b0C Rain today:0.0mm Snow now: 0.0cm Wind N Conditions: Clear Daylight:18:39 hours Sunset: 22:36

 Voila, we have some Tweets. For interactive environments and other cases where you don't care about collecting your data in a single load or don't need to operate on the stream of Tweets directly, I recommend using this convenience function.

From 8a410d9131f6445b03b32e5be686344e5e3030f2 Mon Sep 17 00:00:00 2001
From: "@snowman"
Date: Wed, 10 Jun 2020 11:11:25 -0600
Subject: [PATCH 20/83] Update README.rst

---
 README.rst | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index 9e6c2b3..7f65fd3 100644
--- a/README.rst
+++ b/README.rst
@@ -38,7 +38,9 @@
 Labs updates
 ============

-When migrating this Python search client from an enterprise or premium search endpoint, the following updates were made:
+Twitter Developer Labs represents an opportunity to apply previous learnings from building Twitter API v1.1 and the premium and enterprise endpoint tiers, and to redesign and rebuild from the ground up. While building this Labs version of the `search-tweets-python` library, we took the opportunity to update some fundamentals. This library provides example scripts, and one update is to their command-line arguments, which better match new Labs conventions. Instead of setting search periods with `start-datetime` and `end-datetime`, these flags have been shortened to match the current search request parameters: `start-time` and `end-time`. Throughout the code, we no longer use parlance that references `rules` and `PowerTrack`, and now reference `queries` and the Labs Recent search endpoint.
+
+When migrating this Python search client to Labs from the enterprise and premium tiers, the following updates were made:

 - Added support for GET requests (and removed POST support for now)
 - Added support for ``since_id`` and ``until_id`` request parameters.
 - Updated pagination details.
 - Updated app command-line parlance
   - --start-datetime → --start-time
   - --end-datetime → --end-time
   - --filter-rule → --query
   - --max-results → --max-tweets
   - Dropped --account-type. No longer required since support for the Premium and Enterprise search tiers has been dropped.
   - Dropped --count-bucket. Removed search 'counts' endpoint support. This endpoint is currently not available in Labs.
-In this spirit of updating the parlance used, note that the a core method provided by searchtweets/result_stream.py has been renamed. The method gen_rule_payload has been updated to gen_request_parameters.
+In this spirit of updating the parlance used, note that a core method provided by searchtweets/result_stream.py has been renamed. The method `gen_rule_payload` has been updated to `gen_request_parameters`.

-Finally, the original version of search-tweets-python used a `Tweet Parser `__ to help manage the differences between two different JSON formats ("original" and "Activity Stream"). With Labs, there is just one version of Tweet JSON, so this Tweet Parser is not used. In the original code, this Tweet parser was envoked with a 'tweetify=True' directive. With this Labs version, this use of the Tweet Parser is turned off by instead using 'tweetify=False'.
+Finally, the original version of search-tweets-python used a `Tweet Parser `__ to help manage the differences between two different JSON formats ("original" and "Activity Stream"). With Labs, there is just one version of Tweet JSON, so this Tweet Parser is not used. In the original code, this Tweet parser was invoked with a `tweetify=True` directive. With this Labs version, this use of the Tweet Parser is turned off by instead using `tweetify=False`.

 Command-line options

From 39ffc32b81d61ff4fad0ed9fdfcf5abdbb216608 Mon Sep 17 00:00:00 2001
From: Jim Moffitt
Date: Wed, 12 Aug 2020 11:29:16 -0600
Subject: [PATCH 21/83] JM: v2 is here

---
 README-labs.rst | 544 ++++++++++++++++++++++++++++++++++++++++++++++++
 README.rst      | 135 ++++++++----
 setup.py        |   6 +-
 3 files changed, 637 insertions(+), 48 deletions(-)
 create mode 100644 README-labs.rst

diff --git a/README-labs.rst b/README-labs.rst
new file mode 100644
index 0000000..7f65fd3
--- /dev/null
+++ b/README-labs.rst
@@ -0,0 +1,544 @@
+.. image:: https://img.shields.io/static/v1?label=Twitter%20API&message=Developer%20Labs%20v2&color=794BC4&style=flat&logo=Twitter
+   :target: https://developer.twitter.com/en/docs/labs/overview/versioning
+   :alt: Labs v2
+
+Python client for Labs Recent search
+====================================
+
+Welcome to the ``labs`` branch of the Python search client. This branch was born from the ``master`` branch that supports premium and enterprise tiers of Twitter search. This branch supports the `Twitter Developer Labs Recent search v2 endpoint `__ only, and drops support for the premium and enterprise tiers.
+
+Note: If you are looking for the original version that works with premium and enterprise versions of search, head on over to the ``master`` branch.
+
+This project serves as a wrapper for the Labs Recent search endpoint, providing a command-line utility and a Python library.
+
+To download and install this package, go to: https://pypi.org/project/searchtweets-labs/
+
+Features
+========
+
+- Supports Labs Recent search, v2.
+- Supports a new "polling" mode using the new Labs ``since-id`` search request parameter. ``since-id``, along with the new ``until-id``, provides a way to navigate the public Tweet archive by Tweet ID.
+- Supports additional ways to specify ``start-time`` and ``end-time`` request parameters:
+
+  - #d - For example, '2d' sets ``start-time`` to (exactly) two days ago.
+  - #h - For example, '12h' sets ``start-time`` to (exactly) twelve hours ago.
+  - #m - For example, '15m' sets ``start-time`` to (exactly) fifteen minutes ago.
+
+  These are handy for kicking off searches with a backfill period, and also work with the ``end-time`` request parameter.
+
+These features were inherited from the enterprise/premium version:
+
+- Command-line utility is pipeable to other tools (e.g., ``jq``).
+- Automatically handles pagination of search results with specifiable limits.
+- Delivers a stream of data to the user for low in-memory requirements.
+- Handles OAuth 2 and Bearer Token authentication.
+- Flexible usage within a Python program.
+
+
+Labs updates
+============
+
+Twitter Developer Labs represents an opportunity to apply previous learnings from building Twitter API v1.1 and the premium and enterprise endpoint tiers, and to redesign and rebuild from the ground up. While building this Labs version of the `search-tweets-python` library, we took the opportunity to update some fundamentals. This library provides example scripts, and one update is to their command-line arguments, which better match new Labs conventions. Instead of setting search periods with `start-datetime` and `end-datetime`, these flags have been shortened to match the current search request parameters: `start-time` and `end-time`. Throughout the code, we no longer use parlance that references `rules` and `PowerTrack`, and now reference `queries` and the Labs Recent search endpoint.
+
+When migrating this Python search client to Labs from the enterprise and premium tiers, the following updates were made:
+
+- Added support for GET requests (and removed POST support for now)
+- Added support for ``since_id`` and ``until_id`` request parameters.
+- Updated pagination details.
+- Updated app command-line parlance
+  - --start-datetime → --start-time
+  - --end-datetime → --end-time
+  - --filter-rule → --query
+  - --max-results → --max-tweets
+  - Dropped --account-type. No longer required since support for the Premium and Enterprise search tiers has been dropped.
+  - Dropped --count-bucket. Removed search 'counts' endpoint support. This endpoint is currently not available in Labs.
+
+In this spirit of updating the parlance used, note that a core method provided by searchtweets/result_stream.py has been renamed. The method `gen_rule_payload` has been updated to `gen_request_parameters` (see the sketch below).
+
+Finally, the original version of search-tweets-python used a `Tweet Parser `__ to help manage the differences between two different JSON formats ("original" and "Activity Stream"). With Labs, there is just one version of Tweet JSON, so this Tweet Parser is not used. In the original code, this Tweet parser was invoked with a `tweetify=True` directive. With this Labs version, this use of the Tweet Parser is turned off by instead using `tweetify=False`.
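+For readers migrating existing code, here is a small before-and-after sketch of that rename (the old call is shown for comparison only and no longer exists in this branch):
+
+.. code:: python
+
+    from searchtweets import gen_request_parameters
+
+    # Previously (premium/enterprise branch):
+    #   rule = gen_rule_payload("snow has:media -is:retweet", results_per_call=500)
+    # Labs equivalent (Labs supports at most 100 results per call):
+    query = gen_request_parameters("snow has:media -is:retweet",
+                                   results_per_call=100)
+
+
+Command-line options
+=====================
+
+usage: search_tweets.py
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --credential-file CREDENTIAL_FILE
+                        Location of the yaml file used to hold your
+                        credentials.
+  --credential-file-key CREDENTIAL_YAML_KEY
+                        the key in the credential file used for this session's
+                        credentials. Defaults to search_tweets_api
+  --env-overwrite ENV_OVERWRITE
+                        Overwrite YAML-parsed credentials with any set
+                        environment variables. See API docs or readme for
+                        details.
+  --config-file CONFIG_FILENAME
+                        configuration file with all parameters. Far easier to
+                        use than the command-line args version. If a valid
+                        file is found, all args will be populated from there.
+                        Remaining command-line args will overrule args found
+                        in the config file.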
+  --start-time START_TIME
+                        Start of datetime window, format 'YYYY-mm-DDTHH:MM'
+                        (default: -7 days)
+  --end-time END_TIME   End of datetime window, format 'YYYY-mm-DDTHH:MM'
+                        (default: most recent date)
+  --query QUERY         Search query. (See:
+                        https://developer.twitter.com/en/docs/labs/recent-
+                        search/guides/search-queries)
+  --since-id SINCE_ID   Tweet ID, will start search from Tweets after this
+                        one. (See:
+                        https://developer.twitter.com/en/docs/labs/recent-
+                        search/guides/pagination)
+  --until-id UNTIL_ID   Tweet ID, will end search from Tweets before this one.
+                        (See:
+                        https://developer.twitter.com/en/docs/labs/recent-
+                        search/guides/pagination)
+  --results-per-call RESULTS_PER_CALL
+                        Number of results to return per call (default 10; max
+                        100) - corresponds to 'max_results' in the API
+  --max-tweets MAX_TWEETS
+                        Maximum number of Tweets to return for this session of
+                        requests.
+  --max-pages MAX_PAGES
+                        Maximum number of pages/API calls to use for this
+                        session.
+  --results-per-file RESULTS_PER_FILE
+                        Maximum tweets to save per file.
+  --filename-prefix FILENAME_PREFIX
+                        prefix for the filename where tweet json data will be
+                        stored.
+  --no-print-stream     disable print streaming
+  --print-stream        Print tweet stream to stdout
+  --extra-headers EXTRA_HEADERS
+                        JSON-formatted str representing a dict of additional
+                        request headers
+  --debug               print all info and warning messages
+
+
+Installation
+=============
+
+The updated Pypi install package for the Labs version is at:
+
+https://pypi.org/project/searchtweets-labs/
+
+Another option is to work directly with this code by cloning the repository, installing the required Python packages, setting up your credentials, and making requests.
+
+To confirm that your code is ready to go, run the ``python3 scripts/search_tweets.py -h`` command. You should see the help details shown above.
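+As a minimal sketch, either route should work (the repository URL is the one listed in setup.py, and the editable install is just one option):
+
+.. code:: bash
+
+    # Option 1: install the packaged Labs release from PyPI.
+    pip install searchtweets-labs
+
+    # Option 2: work from a clone of this branch.
+    git clone https://github.com/twitterdev/search-tweets-python.git
+    cd search-tweets-python
+    git checkout labs
+    pip install -e .
+
+Credential Handling
+===================
+
+The Labs Recent search endpoint uses app-only authentication. You have the choice to configure your application consumer key and secret, or a Bearer Token you have generated. If you supply the application key and secret, the client will generate a Bearer Token for you.
+
+Many developers might find it more straightforward to provide the application key and secret and let this library manage Bearer Token generation. Please see `HERE `_ for an overview of the app-only authentication method.
+
+For reference, a minimal sketch of the token exchange the library performs on your behalf (the endpoint and ``client_credentials`` grant are part of Twitter's standard app-only auth flow; the key and secret placeholders are yours to fill in):
+
+.. code:: python
+
+    import requests
+
+    # Exchange a consumer key/secret for a Bearer Token (app-only auth).
+    resp = requests.post("https://api.twitter.com/oauth2/token",
+                         auth=("<CONSUMER_KEY>", "<CONSUMER_SECRET>"),
+                         data={"grant_type": "client_credentials"})
+    bearer_token = resp.json()["access_token"]
+
+We support both YAML-file based methods and environment variables for storing credentials, and provide flexible handling with sensible defaults.
+
+YAML method
+===========
+
+The simplest credential file should look like this:
+
+.. code:: yaml
+
+    search_tweets_api:
+      endpoint: https://api.twitter.com/labs/2/tweets/search
+      consumer_key:
+      consumer_secret:
+
+By default, this library expects this file at ``~/.twitter_keys.yaml``, but you can pass the relevant location as needed, either with the --credential-file flag for the command-line app or as demonstrated below in a Python program.
+
+Both of the above examples require no special command-line arguments or in-program arguments. The credential parsing methods, unless otherwise specified, will look for a YAML key called search_tweets_api.
+
+For developers who have multiple endpoints and/or search products, you can keep all credentials in the same file and specify specific keys to use. --credential-file-key specifies this behavior in the command line app.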
An example:
+
+.. code:: yaml
+
+    search_tweets_labsv1:
+      endpoint: https://api.twitter.com/labs/1/tweets/search
+      consumer_key:
+      consumer_secret:
+      (optional) bearer_token:
+
+    search_tweets_labsv2:
+      endpoint: https://api.twitter.com/labs/2/tweets/search
+      consumer_key:
+      consumer_secret:
+      (optional) bearer_token:
+
+Environment Variables
+=====================
+
+If you want or need to pass credentials via environment variables, you can set the appropriate variables:
+
+::
+
+    export SEARCHTWEETS_ENDPOINT=
+    export SEARCHTWEETS_BEARER_TOKEN=
+    export SEARCHTWEETS_CONSUMER_KEY=
+    export SEARCHTWEETS_CONSUMER_SECRET=
+
+The ``load_credentials`` function will attempt to find these variables if it cannot load fields from the YAML file, and it will **overwrite any credentials from the YAML file that are present as environment variables** if they have been parsed. This behavior can be changed by setting the ``load_credentials`` parameter ``env_overwrite`` to ``False``.
+
+The following examples demonstrate credential handling in the Python library.
+
+.. code:: python
+
+    from searchtweets import load_credentials
+
+.. code:: python
+
+    load_credentials(filename="./search_tweets_creds_example.yaml",
+                     yaml_key="search_tweets_v2_example",
+                     env_overwrite=False)
+
+::
+
+    {'bearer_token': '',
+     'endpoint': 'https://api.twitter.com/labs/2/tweets/search',
+     'extra_headers_dict': None}
+
+Environment Variable Overrides
+------------------------------
+
+If we set our environment variables, the program will look for them
+regardless of a YAML file's validity or existence.
+
+.. code:: python
+
+    import os
+    os.environ["SEARCHTWEETS_BEARER_TOKEN"] = ""
+    os.environ["SEARCHTWEETS_ENDPOINT"] = ""
+
+    load_credentials(filename="nothing_here.yaml", yaml_key="no_key_here")
+
+::
+
+    cannot read file nothing_here.yaml
+    Error parsing YAML file; searching for valid environment variables
+
+::
+
+    {'bearer_token': '',
+     'endpoint': ''}
+
+Command-line app
+----------------
+
+the flags:
+
+- ``--credential-file ``
+- ``--credential-file-key ``
+- ``--env-overwrite``
+
+are used to control credential behavior from the command-line app.
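+For instance, a run that points the app at a non-default credentials file might look like this (the flag names are as listed above; the file path and YAML key are illustrative):
+
+.. code:: bash
+
+    search_tweets.py \
+      --credential-file ./my_keys.yaml \
+      --credential-file-key search_tweets_labsv2 \
+      --query "snow has:media -is:retweet" \
+      --max-tweets 100 \
+      --print-stream
+
+--------------
+
+Using the Command Line Application
+==================================
+
+The library includes an application, ``search_tweets.py``, that provides rapid access to Tweets. When you use ``pip`` to install this package, ``search_tweets.py`` is installed globally. The file is located in the ``scripts/`` directory for those who want to run it locally.
+
+Note that the ``--results-per-call`` flag specifies an argument to the API, not a hard max on the number of results returned from this program. The argument ``--max-tweets`` defines the maximum number of results to return from a single run of the ``search_tweets.py`` script. All examples assume that your credentials are set up correctly in the default location - ``~/.twitter_keys.yaml`` - or in environment variables.
+
+**Stream json results to stdout without saving**
+
+.. code:: bash
+
+    search_tweets.py \
+      --max-tweets 1000 \
+      --results-per-call 100 \
+      --query "(snow OR rain) has:media -is:retweet" \
+      --print-stream
+
+**Stream json results to stdout and save to a file**
+
+.. code:: bash
+
+    search_tweets.py \
+      --max-tweets 1000 \
+      --results-per-call 100 \
+      --query "(snow OR rain) has:media -is:retweet" \
+      --filename-prefix weather_pics \
+      --print-stream
+
+**Save to file without output**
+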
+.. code:: bash
+
+    search_tweets.py \
+      --max-tweets 100 \
+      --results-per-call 100 \
+      --query "(snow OR rain) has:media -is:retweet" \
+      --filename-prefix weather_pic \
+      --no-print-stream
+
+One or more custom headers can be specified from the command line, using the ``--extra-headers`` argument and a JSON-formatted string representing a dictionary of extra headers:
+
+.. code:: bash
+
+    search_tweets.py \
+      --query "(snow OR rain) has:media -is:retweet" \
+      --extra-headers '{"":""}'
+
+Options can be passed via a configuration file (either ini or YAML). Example files can be found in the ``config/api_config_example.config`` or ``config/api_yaml_example.yaml`` files, which might look like this:
+
+.. code:: ini
+
+    [search_rules]
+    start_time = 2020-05-01
+    end_time = 2020-05-01
+    query = (snow OR rain) has:media -is:retweet
+
+    [search_params]
+    results_per_call = 100
+    max_tweets = 10000
+
+    [output_params]
+    save_file = True
+    filename_prefix = weather-pics
+    results_per_file = 10000000
+
+Or this:
+
+.. code:: yaml
+
+    search_rules:
+        start_time: 2020-05-01
+        end_time: 2020-05-01 01:01
+        query: (snow OR rain) has:media -is:retweet
+
+    search_params:
+        results_per_call: 100
+        max_tweets: 500
+
+    output_params:
+        save_file: True
+        filename_prefix: weather_pics
+        results_per_file: 10000000
+
+Custom headers can be specified in a config file, under a specific credentials key:
+
+.. code:: yaml
+
+    search_tweets_api:
+        endpoint:
+        bearer_token:
+        extra_headers:
+            :
+
+When using a config file in conjunction with the command-line utility, you need to specify your config file via the ``--config-file`` parameter. Additional command-line arguments will either be added to the config file args or overwrite the config file args if both are specified and present.
+
+Example:
+
+::
+
+    search_tweets.py \
+      --config-file myapiconfig.config \
+      --no-print-stream
+
+------------------
+
+Using the Twitter Search APIs' Python Wrapper
+=============================================
+
+Working with the API within a Python program is straightforward.
+
+We'll assume that credentials are in the default location,
+``~/.twitter_keys.yaml``.
+
+.. code:: python
+
+    from searchtweets import ResultStream, gen_request_parameters, load_credentials
+
+
+Labs Setup
+-------------
+
+.. code:: python
+
+    labs_search_args = load_credentials("~/.twitter_keys.yaml",
+                                        yaml_key="search_tweets_labs",
+                                        env_overwrite=False)
+
+
+There is a function that formats search API queries into valid JSON request parameters, called ``gen_request_parameters``. It has sensible defaults, such as pulling more Tweets per call than the default 10, and not including dates. Discussing the finer points of
+generating search queries is out of scope for these examples; we encourage you to see the docs to learn the nuances within, but for now let's see what a query looks like.
+
+.. code:: python
+
+    query = gen_request_parameters("snow", results_per_call=100)
+    print(query)
+
+::
+
+    {"query":"snow","max_results":100}
+
+This query will match tweets that have the text ``snow`` in them.
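+The new polling-style parameters slot in the same way. As a hedged sketch, based on the request-parameter signature added in this branch (the Tweet ID below is illustrative, taken from the sample output later in this README):
+
+.. code:: python
+
+    # Ask only for Tweets posted after the newest ID seen on a previous poll.
+    next_query = gen_request_parameters("snow",
+                                        results_per_call=100,
+                                        since_id="1270572563505254404")
+
+From this point, there are two ways to interact with the API. There is a quick method to collect smaller amounts of Tweets to memory that requires less thought and knowledge, and interaction with the ``ResultStream`` object which will be introduced later.
+
+Fast Way
+--------
+
+We'll use the ``search_args`` variable to power the configuration point for the API.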
The object also takes a valid search query and has options to cutoff search when hitting limits on both number of Tweets and endpoint calls. + +We'll be using the ``collect_results`` function, which has three parameters. + +- query: a valid search query, referenced earlier +- max_results: as the API handles pagination, it will stop collecting + when we get to this number +- result_stream_args: configuration args that we've already specified. + +Let's see how it goes: + +.. code:: python + + from searchtweets import collect_results + +.. code:: python + + tweets = collect_results(query, + max_results=100, + result_stream_args=labs_search_args) # change this if you need to + +An overwhelming number of Tweet attributes are made available directly, as such: + +.. code:: python + + [print(tweet.text, end='\n\n') for tweet in tweets[0:10]]; + +:: + + @CleoLoughlin Rain after the snow? Do you have ice now? + + @koofltxr Rain, 134340, still with you, winter bear, Seoul, crystal snow, sea, outro:blueside + + @TheWxMeister Sorry it ruined your camping. I was covering plants in case we got snow in the Mountain Shadows area. Thankfully we didn\u2019t. At least it didn\u2019t stick to the ground. The wind was crazy! Got just over an inch of rain. Looking forward to better weather. + + @brettlorenzen And, the reliability of \u201cNeither snow nor rain nor heat nor gloom of night stays these couriers (the #USPS) from the swift completion of their appointed rounds.\u201d + + Because black people get killed in the rain, black lives matter in the rain. It matters all the time. Snow, rain, sleet, sunny days. We're not out here because it's sunny. We're not out here for fun. We're out here because black lives matter. + + Some of the master copies of the film \u201cGone With the Wind\u201d are archived at the @librarycongress near \u201cSnow White and the Seven Dwarfs\u201d and \u201cSingin\u2019 in the Rain.\u201d GWTW isn\u2019t going to vanish off the face of the earth. + + Snow Man\u306eD.D.\u3068\nSixTONES\u306eImitation Rain\n\u6d41\u308c\u305f\u301c + + @Nonvieta Yup I work in the sanitation industry. I'm in the office however. Life would not go on without our garbage men and women out there. All day everyday rain snow or shine they out there. + + This picture of a rainbow in WA proves nothing. How do we know if this rainbow was not on Mars or the ISS? Maybe it was drawn in on the picture. WA has mail-in voting so we do have to worry aboug rain, snow, poll workers not showing up or voting machines broke on election day !! https://t.co/5WdHx0acS0 https://t.co/BEKtTpBW9g + + Weather in Oslo at 06:00: Clear Temp: 10.6\u00b0C Min today: 9.1\u00b0C Rain today:0.0mm Snow now: 0.0cm Wind N Conditions: Clear Daylight:18:39 hours Sunset: 22:36 + +Voila, we have some Tweets. For interactive environments and other cases where you don't care about collecting your data in a single load or don't need to operate on the stream of Tweets directly, I recommend using this convenience function. + +Working with the ResultStream +----------------------------- + +The ResultStream object will be powered by the ``search_args``, and takes the query and other configuration parameters, including a hard stop on number of pages to limit your API call usage. + +.. 
code:: python + + rs = ResultStream(query=query, + max_results=500, + max_pages=1, + **labs_search_args) + + print(rs) + + :: + + ResultStream: + { + "endpoint":"https:\/\/fanyv88.com:443\/https\/api.twitter.com\/labs\/2\/tweets\/search", + "request_parameters":{ + "query":"snow", + "max_results":100 + }, + "tweetify":false, + "max_results":1000 + } + +There is a function, ``.stream``, that seamlessly handles requests and pagination for a given query. It returns a generator, and to grab our 1000 Tweets that mention ``snow`` we can do this: + +.. code:: python + + tweets = list(rs.stream()) + +.. code:: python + + # using unidecode to prevent emoji/accents printing + [print(tweet) for tweet in tweets[0:10]]; + +:: + +{"id": "1270572563505254404", "text": "@CleoLoughlin Rain after the snow? Do you have ice now?"} +{"id": "1270570767038599168", "text": "@koofltxr Rain, 134340, still with you, winter bear, Seoul, crystal snow, sea, outro:blueside"} +{"id": "1270570621282340864", "text": "@TheWxMeister Sorry it ruined your camping. I was covering plants in case we got snow in the Mountain Shadows area. Thankfully we didn\u2019t. At least it didn\u2019t stick to the ground. The wind was crazy! Got just over an inch of rain. Looking forward to better weather."} +{"id": "1270569070287630337", "text": "@brettlorenzen And, the reliability of \u201cNeither snow nor rain nor heat nor gloom of night stays these couriers (the #USPS) from the swift completion of their appointed rounds.\u201d"} +{"id": "1270568690447257601", "text": "\"Because black people get killed in the rain, black lives matter in the rain. It matters all the time. Snow, rain, sleet, sunny days. We're not out here because it's sunny. We're not out here for fun. We're out here because black lives matter.\" @wisn12news https://t.co/3kZZ7q2MR9"} +{"id": "1270568607605575680", "text": "Some of the master copies of the film \u201cGone With the Wind\u201d are archived at the @librarycongress near \u201cSnow White and the Seven Dwarfs\u201d and \u201cSingin\u2019 in the Rain.\u201d GWTW isn\u2019t going to vanish off the face of the earth."} +{"id": "1270568437916426240", "text": "Snow Man\u306eD.D.\u3068\nSixTONES\u306eImitation Rain\n\u6d41\u308c\u305f\u301c"} +{"id": "1270568195519373313", "text": "@Nonvieta Yup I work in the sanitation industry. I'm in the office however. Life would not go on without our garbage men and women out there. All day everyday rain snow or shine they out there."} +{"id": "1270567737283117058", "text": "This picture of a rainbow in WA proves nothing. How do we know if this rainbow was not on Mars or the ISS? Maybe it was drawn in on the picture. WA has mail-in voting so we do have to worry aboug rain, snow, poll workers not showing up or voting machines broke on election day !! https://t.co/5WdHx0acS0 https://t.co/BEKtTpBW9g"} +{"id": "1270566386524356608", "text": "Weather in Oslo at 06:00: Clear Temp: 10.6\u00b0C Min today: 9.1\u00b0C Rain today:0.0mm Snow now: 0.0cm Wind N Conditions: Clear Daylight:18:39 hours Sunset: 22:36"} + +Contributing +============ + +Any contributions should follow the following pattern: + +1. Make a feature or bugfix branch, e.g., + ``git checkout -b my_new_feature`` +2. Make your changes in that branch +3. Ensure you bump the version number in ``searchtweets/_version.py`` to + reflect your changes. 
We use `Semantic
   Versioning `__, so non-breaking enhancements
   should increment the minor version, e.g., ``1.5.0 -> 1.6.0``, and
   bugfixes will increment the last version, ``1.6.0 -> 1.6.1``.
4. Create a pull request

After the pull request process is accepted, package maintainers will
handle building documentation and distribution to Pypi.

For reference, distributing to Pypi is accomplished by the following
commands, run from the root directory of the repo:

.. code:: bash

   python setup.py bdist_wheel
   python setup.py sdist
   twine upload dist/*

How to build the documentation:

Building the documentation requires a few Sphinx packages to build the
webpages:

.. code:: bash

   pip install sphinx
   pip install sphinx_bootstrap_theme
   pip install sphinxcontrib-napoleon

Then (once your changes are committed to master) you should be able to
run the documentation-generating bash script and follow the
instructions:

.. code:: bash

   bash build_sphinx_docs.sh master searchtweets

Note that this README is also generated, and so after any README changes
you'll need to re-build the README (you need pandoc version 2.1+ for
this) and commit the result:

.. code:: bash

   bash make_readme.sh
diff --git a/README.rst b/README.rst
index 7f65fd3..a17c6f1 100644
--- a/README.rst
+++ b/README.rst
@@ -1,23 +1,27 @@
-.. image:: https://img.shields.io/static/v1?label=Twitter%20API&message=Developer%20Labs%20v2&color=794BC4&style=flat&logo=Twitter
-   :target: https://developer.twitter.com/en/docs/labs/overview/versioning
-   :alt: Labs v2
+.. image:: https://img.shields.io/endpoint?url=https%3A%2F%2Ffanyv88.com%3A443%2Fhttps%2Ftwbadges.glitch.me%2Fbadges%2Fv2
+   :target: https://developer.twitter.com/en/docs/twitter-api
+   :alt: Twitter API v2
 
-Python client for Labs Recent search
-====================================
+Python client for the Twitter API v2 recent search endpoint
+===========================================================
+
+Welcome to the ``v2`` branch of the Python search client. This branch was born from the main branch that supports
+premium and enterprise tiers of Twitter search. This branch supports the `Twitter API v2 recent search endpoint `__ only, and drops support for the premium and enterprise tiers.
 
-Welcome to the ``labs`` branch of the Python search client. This branch was born from the ``master`` branch that supports premium and enterprise tiers of Twitter search. This branch supports the `Twitter Developer Labs Recent search v2 endpoint `__ only, and drops support for the premium and enterprise tiers.
+This project serves as a wrapper for the Twitter API v2 recent search endpoint, providing a command-line utility and a Python library.
 
-Note: If you are looking for the original version that works with premium and enterprise versions of search, head on over to the ``master`` branch.
+To download and install this package, go to: https://pypi.org/project/searchtweets-v2/
 
-This project serves as a wrapper for the Labs Recent search endpoint, providing a command-line utility and a Python library.
+If you are looking for the original version that works with premium and enterprise versions of search, head on over to
+the main or ``enterprise-premium`` branch. (Soon, the v2 version will be promoted to the main branch.) 
-To download and install this package, go to: https://pypi.org/project/searchtweets-labs/ Features ======== -- Supports Labs Recent search, v2. -- Supports a new "polling" mode using the new Labs ``since-id`` search request parameter. The ``since-id``, along with the new ``until-id`` provide a way to navigate the public Tweet archive by Tweet ID. +- Supports Twitter API v2 recent search. +- Supports the configuration of v2 `expansions `_ and `fields `_. +- Supports a new "polling" mode using the ``since-id`` search request parameter. The ``since-id``, along with the new ``until-id`` provide a way to navigate the public Tweet archive by Tweet ID. - Supports additional ways to specify ``start-time`` and ``end-time`` request parameters: - d# - For example, 'd2' sets ``start-time`` to (exactly) two days ago. @@ -35,34 +39,57 @@ These features were inherited from the enterprise/premium version: - Flexible usage within a python program. -Labs updates -============ +Twitter API v2 recent search updates +==================================== -Twitter Developer Labs represents an opportunity to apply previous learnings from building Twitter API v1.1. and the premium and enterprise tiers of endpoints, and redesign and rebuild from the ground up. While building this Labs version of the `search-tweets-python` library, we took the opportunity to update fundamental things. This library provides example scripts, and one example is updating their command-line arguments to better match new Labs conventions. Instead of setting search periods with `start-datetime` and `end-datetime`, they have been shortened to match current search request parameters: `start-time` and `end-time`. Throughout the code , we no longer use parlance that references `rules` and `PowerTrack`, and now reference `queries` and the Labs Recent search endpoint. +Twitter API v2 represents an opportunity to apply previous learnings from building Twitter API v1.1. and the premium and enterprise tiers of endpoints, and redesign and rebuild from the ground up. While building this v2 version of the `search-tweets-python` library, +we took the opportunity to update fundamental things. This library provides example scripts, and one example is updating their command-line arguments to better match new v2 conventions. Instead of setting search periods with `start-datetime` and `end-datetime`, +they have been shortened to match current search request parameters: `start-time` and `end-time`. Throughout the code, we no longer use parlance that references `rules` and `PowerTrack`, and now reference `queries` and the v2 recent search endpoint. -When migrating this Python search client to Labs from the enterprise and premium tiers, the following updates were made: +When migrating this Python search client to v2 from the enterprise and premium tiers, the following updates were made: -- Added support for GET requests (and removed POST support for now) +- Added support for GET requests (and removed POST support for now). - Added support for ``since_id`` and ``until_id`` request parameters. - Updated pagination details. -- Updated app command-line parlance +- Updated app command-line parlance: - --start-datetime → --start-time - --end-datetime → --end-time - --filter-rule → --query - --max-results → --max-tweets - Dropped --account-type. No longer required since support for Premium and Enterprise search tiers have been dropped. - - Dropped --count-bucket. Removed search 'counts' endpoint support. 
This endpoint is currently not available in Labs.
+   - Dropped --count-bucket. Removed search 'counts' endpoint support. This endpoint is currently not available in v2.
 
 In this spirit of updating the parlance used, note that a core method provided by searchtweets/result_stream.py has been renamed. The method `gen_rule_payload` has been updated to `gen_request_parameters`.
 
-Finally, the original version of search-tweets-python used a `Tweet Parser `__ to help manage the differences between two different JSON formats ("original" and "Activity Stream"). With Labs, there is just one version of Tweet JSON, so this Tweet Parser is not used. In the original code, this Tweet parser was envoked with a `tweetify=True directive. With this Labs version, this use of the Tweet Parser is turned off by instead using `tweetify=False`.
+Finally, the original version of search-tweets-python used a `Tweet Parser `__ to help manage the differences between two different JSON formats ("original" and "Activity Stream"). With v2, there is just one version of Tweet JSON, so this Tweet Parser is not used.
+In the original code, this Tweet parser was invoked with a `tweetify=True` directive. With this v2 version, this use of the Tweet Parser is turned off by instead using `tweetify=False`.
 
 Command-line options
-=====================
+====================
 
 usage: search_tweets.py
 
+Connected to pydev debugger (build 173.4674.33)
+usage: search_tweets.py [-h] [--credential-file CREDENTIAL_FILE]
+                        [--credential-file-key CREDENTIAL_YAML_KEY]
+                        [--env-overwrite ENV_OVERWRITE]
+                        [--config-file CONFIG_FILENAME] [--query QUERY]
+                        [--start-time START_TIME] [--end-time END_TIME]
+                        [--since-id SINCE_ID] [--until-id UNTIL_ID]
+                        [--results-per-call RESULTS_PER_CALL]
+                        [--expansions EXPANSIONS]
+                        [--tweet-fields TWEET_FIELDS]
+                        [--user-fields USER_FIELDS]
+                        [--media-fields MEDIA_FIELDS]
+                        [--place-fields PLACE_FIELDS]
+                        [--poll-fields POLL_FIELDS] [--max-tweets MAX_TWEETS]
+                        [--max-pages MAX_PAGES]
+                        [--results-per-file RESULTS_PER_FILE]
+                        [--filename-prefix FILENAME_PREFIX]
+                        [--no-print-stream] [--print-stream]
+                        [--extra-headers EXTRA_HEADERS] [--debug]
+
 optional arguments:
   -h, --help show this help message and exit
   --credential-file CREDENTIAL_FILE
@@ -81,25 +108,42 @@ optional arguments:
                        file is found, all args will be populated, from there.
                        Remaining command-line args, will overrule args found
                        in the config, file.
+  --query QUERY Search query. (See:
+                        https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-rule)
   --start-time START_TIME
                        Start of datetime window, format 'YYYY-mm-DDTHH:MM'
                        (default: -7 days)
   --end-time END_TIME End of datetime window, format 'YYYY-mm-DDTHH:MM'
                        (default: most recent date)
-  --query QUERY Search query. (See:
-                        https://developer.twitter.com/en/docs/labs/recent-
-                        search/guides/search-queries)
   --since-id SINCE_ID Tweet ID, will start search from Tweets after this
                        one. (See:
-                        https://developer.twitter.com/en/docs/labs/recent-
-                        search/guides/pagination)
+                        https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/paginate)
   --until-id UNTIL_ID Tweet ID, will end search from Tweets before this one. 
(See: 
-                        https://developer.twitter.com/en/docs/labs/recent-
-                        search/guides/pagination)
+                        https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/paginate)
   --results-per-call RESULTS_PER_CALL
                        Number of results to return per call (default 10; max
                        100) - corresponds to 'max_results' in the API
+  --expansions EXPANSIONS
+                        A comma-delimited list of object expansions to include
+                        in endpoint responses. (API default: "")
+  --tweet-fields TWEET_FIELDS
+                        A comma-delimited list of Tweet JSON attributes to
+                        include in endpoint responses. (API default:
+                        "id,text")
+  --user-fields USER_FIELDS
+                        A comma-delimited list of user JSON attributes to
+                        include in endpoint responses. (API default: "id")
+  --media-fields MEDIA_FIELDS
+                        A comma-delimited list of media JSON attributes to
+                        include in endpoint responses. (API default: "id")
+  --place-fields PLACE_FIELDS
+                        A comma-delimited list of Twitter Place JSON
+                        attributes to include in endpoint responses. (API
+                        default: "id")
+  --poll-fields POLL_FIELDS
+                        A comma-delimited list of Tweet Poll JSON attributes
+                        to include in endpoint responses. (API default: "id")
   --max-tweets MAX_TWEETS
                        Maximum number of Tweets to return for this session of
                        requests.
@@ -115,16 +159,16 @@ optional arguments:
   --print-stream Print tweet stream to stdout
   --extra-headers EXTRA_HEADERS
                        JSON-formatted str representing a dict of additional
-                        request headers
+                        HTTP request headers
   --debug print all info and warning messages
 
 Installation
 =============
 
-The updated Pypi install package for the Labs version is at:
+The updated Pypi install package for the v2 version is at:
 
-https://pypi.org/project/searchtweets-labs/
+https://pypi.org/project/searchtweets-v2/
 
 Another option is to work directly with this code by cloning the repository, installing the required Python packages, setting up your credentials, and making requests.
 
@@ -133,7 +177,7 @@ To confirm that your code is ready to go, run the ``$python3 scripts/search-tweet
 Credential Handling
 ===================
 
-The Labs Recent search endpoint uses app-only authentication. You have the choice to configure your application consumer key and secret, or a Bearer Token you have generated. If you supply the application key and secret, the client will generate a Bearer Token for you.
+The Twitter API v2 recent search endpoint uses app-only authentication. You have the choice to configure your application consumer key and secret, or a Bearer Token you have generated. If you supply the application key and secret, the client will generate a Bearer Token for you.
 
 Many developers might find it more straightforward to provide their application key and secret and let this library manage Bearer Token generation for them. Please see `HERE `_ for an overview of the app-only authentication method.
 
 We support both YAML-file based methods and environment variables for storing credentials, and provide flexible handling with sensible defaults.
 
@@ -147,9 +191,10 @@ The simplest credential file should look like this:
 
 .. code:: yaml
 
    search_tweets_api:
-      endpoint: https://api.twitter.com/labs/2/tweets/search
+      endpoint: https://api.twitter.com/2/tweets/search/recent
       consumer_key: 
       consumer_secret: 
+      bearer_token: 
 
 By default, this library expects this file at "~/.twitter_keys.yaml", but you can pass the relevant location as needed, either with the --credential-file flag for the command-line app or as demonstrated below in a Python program. 
@@ -159,8 +204,8 @@ For developers who have multiple endpoints and/or search products, you can keep .. code:: yaml - search_tweets_labsv1: - endpoint: https://api.twitter.com/labs/1/tweets/search + search_tweets_v2: + endpoint: https://api.twitter.com/2/tweets/search/recent consumer_key: consumer_secret: (optional) bearer_token: @@ -200,7 +245,7 @@ The following cells demonstrates credential handling in the Python library. :: {'bearer_token': '', - 'endpoint': 'https://api.twitter.com/labs/2/tweets/search', + 'endpoint': 'https://api.twitter.com/2/tweets/search/recent', 'extra_headers_dict': None} Environment Variable Overrides @@ -252,7 +297,7 @@ Note that the ``--results-per-call`` flag specifies an argument to the API, not .. code:: bash search_tweets.py \ - --max-tweets 1000 \ + --max-tweets 10000 \ --results-per-call 100 \ --query "(snow OR rain) has:media -is:retweet" \ --print-stream @@ -262,7 +307,7 @@ Note that the ``--results-per-call`` flag specifies an argument to the API, not .. code:: bash search_tweets.py \ - --max-results 1000 \ + --max-tweets 10000 \ --results-per-call 100 \ --query "(snow OR rain) has:media -is:retweet" \ --filename-prefix beyonce_geo \ @@ -273,7 +318,7 @@ Note that the ``--results-per-call`` flag specifies an argument to the API, not .. code:: bash search_tweets.py \ - --max-results 100 \ + --max-tweets 10000 \ --results-per-call 100 \ --query "(snow OR rain) has:media -is:retweet" \ --filename-prefix weather_pic \ @@ -358,13 +403,13 @@ We'll assume that credentials are in the default location, from searchtweets import ResultStream, gen_request_parameters, load_credentials -Labs Setup -------------- +Twitter API v2 Setup +-------------------- .. 
code:: python - labs_search_args = load_credentials("~/.twitter_keys.yaml", - yaml_key="search_tweets_labs", + search_args = load_credentials("~/.twitter_keys.yaml", + yaml_key="search_tweets_v2", env_overwrite=False) @@ -406,7 +451,7 @@ Let's see how it goes: tweets = collect_results(query, max_results=100, - result_stream_args=labs_search_args) # change this if you need to + result_stream_args=search_args) # change this if you need to An overwhelming number of Tweet attributes are made available directly, as such: @@ -448,7 +493,7 @@ The ResultStream object will be powered by the ``search_args``, and takes the qu rs = ResultStream(query=query, max_results=500, max_pages=1, - **labs_search_args) + **search_args) print(rs) @@ -456,7 +501,7 @@ The ResultStream object will be powered by the ``search_args``, and takes the qu ResultStream: { - "endpoint":"https:\/\/fanyv88.com:443\/https\/api.twitter.com\/labs\/2\/tweets\/search", + "endpoint":"https:\/\/fanyv88.com:443\/https\/api.twitter.com\/2\/tweets\/search\/recent", "request_parameters":{ "query":"snow", "max_results":100 diff --git a/setup.py b/setup.py index cfd8c14..bb7047d 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def parse_version(str_): VERSION = parse_version(_version_line) setup(name='searchtweets-labs', - description="Wrapper for Twitter Developer Labs Recent search endpoint.", + description="Wrapper for Twitter API v2 recent search endpoint.", url='https://github.com/twitterdev/search-tweets-python', author='Fiona Pigott, Jeff Kolb, Josh Montague, Aaron Gonzales, Jim Moffitt', long_description=open('README.rst', 'r', encoding="utf-8").read(), @@ -31,7 +31,7 @@ def parse_version(str_): license='MIT', version=VERSION, python_requires='>=3.3', - install_requires=["requests", "tweet_parser", "pyyaml"], + install_requires=["requests", "pyyaml"], packages=find_packages(), - scripts=["scripts/search_tweets.py","scripts/poll_tweets.py"], + scripts=["scripts/search.py","scripts/polling.py"], ) From 2be6199083d15c72a06071863d9ace85a842dc1f Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Wed, 12 Aug 2020 11:41:10 -0600 Subject: [PATCH 22/83] Delete README-labs.rst --- README-labs.rst | 544 ------------------------------------------------ 1 file changed, 544 deletions(-) delete mode 100644 README-labs.rst diff --git a/README-labs.rst b/README-labs.rst deleted file mode 100644 index 7f65fd3..0000000 --- a/README-labs.rst +++ /dev/null @@ -1,544 +0,0 @@ -.. image:: https://img.shields.io/static/v1?label=Twitter%20API&message=Developer%20Labs%20v2&color=794BC4&style=flat&logo=Twitter - :target: https://developer.twitter.com/en/docs/labs/overview/versioning - :alt: Labs v2 - -Python client for Labs Recent search -==================================== - -Welcome to the ``labs`` branch of the Python search client. This branch was born from the ``master`` branch that supports premium and enterprise tiers of Twitter search. This branch supports the `Twitter Developer Labs Recent search v2 endpoint `__ only, and drops support for the premium and enterprise tiers. - -Note: If you are looking for the original version that works with premium and enterprise versions of search, head on over to the ``master`` branch. - -This project serves as a wrapper for the Labs Recent search endpoint, providing a command-line utility and a Python library. 
- -To download and install this package, go to: https://pypi.org/project/searchtweets-labs/ - -Features -======== - -- Supports Labs Recent search, v2. -- Supports a new "polling" mode using the new Labs ``since-id`` search request parameter. The ``since-id``, along with the new ``until-id`` provide a way to navigate the public Tweet archive by Tweet ID. -- Supports additional ways to specify ``start-time`` and ``end-time`` request parameters: - - - d# - For example, 'd2' sets ``start-time`` to (exactly) two days ago. - - h# - For example, 'h12' sets ``start-time`` to (exactly) twelve hours ago. - - m# - For example, 'm15' sets ``start-time`` to (exactly) fifteen minutes ago. - - These are handy for kicking off searches with a backfill period, and also work with the ``end-time`` request parameter. - -These features were inherited from the enterprise/premium version: - -- Command-line utility is pipeable to other tools (e.g., ``jq``). -- Automatically handles pagination of search results with specifiable limits. -- Delivers a stream of data to the user for low in-memory requirements. -- Handles OAuth 2 and Bearer Token authentication. -- Flexible usage within a python program. - - -Labs updates -============ - -Twitter Developer Labs represents an opportunity to apply previous learnings from building Twitter API v1.1. and the premium and enterprise tiers of endpoints, and redesign and rebuild from the ground up. While building this Labs version of the `search-tweets-python` library, we took the opportunity to update fundamental things. This library provides example scripts, and one example is updating their command-line arguments to better match new Labs conventions. Instead of setting search periods with `start-datetime` and `end-datetime`, they have been shortened to match current search request parameters: `start-time` and `end-time`. Throughout the code , we no longer use parlance that references `rules` and `PowerTrack`, and now reference `queries` and the Labs Recent search endpoint. - -When migrating this Python search client to Labs from the enterprise and premium tiers, the following updates were made: - -- Added support for GET requests (and removed POST support for now) -- Added support for ``since_id`` and ``until_id`` request parameters. -- Updated pagination details. -- Updated app command-line parlance - - --start-datetime → --start-time - - --end-datetime → --end-time - - --filter-rule → --query - - --max-results → --max-tweets - - Dropped --account-type. No longer required since support for Premium and Enterprise search tiers have been dropped. - - Dropped --count-bucket. Removed search 'counts' endpoint support. This endpoint is currently not available in Labs. - -In this spirit of updating the parlance used, note that a core method provided by searchtweets/result_stream.py has been renamed. The method `gen_rule_payload` has been updated to `gen_request_parameters`. - -Finally, the original version of search-tweets-python used a `Tweet Parser `__ to help manage the differences between two different JSON formats ("original" and "Activity Stream"). With Labs, there is just one version of Tweet JSON, so this Tweet Parser is not used. In the original code, this Tweet parser was envoked with a `tweetify=True directive. With this Labs version, this use of the Tweet Parser is turned off by instead using `tweetify=False`. 
- - -Command-line options -===================== - -usage: search_tweets.py - -optional arguments: - -h, --help show this help message and exit - --credential-file CREDENTIAL_FILE - Location of the yaml file used to hold your - credentials. - --credential-file-key CREDENTIAL_YAML_KEY - the key in the credential file used for this session's - credentials. Defaults to search_tweets_api - --env-overwrite ENV_OVERWRITE - Overwrite YAML-parsed credentials with any set - environment variables. See API docs or readme for - details. - --config-file CONFIG_FILENAME - configuration file with all parameters. Far, easier to - use than the command-line args version., If a valid - file is found, all args will be populated, from there. - Remaining command-line args, will overrule args found - in the config, file. - --start-time START_TIME - Start of datetime window, format 'YYYY-mm-DDTHH:MM' - (default: -7 days) - --end-time END_TIME End of datetime window, format 'YYYY-mm-DDTHH:MM' - (default: most recent date) - --query QUERY Search query. (See: - https://developer.twitter.com/en/docs/labs/recent- - search/guides/search-queries) - --since-id SINCE_ID Tweet ID, will start search from Tweets after this - one. (See: - https://developer.twitter.com/en/docs/labs/recent- - search/guides/pagination) - --until-id UNTIL_ID Tweet ID, will end search from Tweets before this one. - (See: - https://developer.twitter.com/en/docs/labs/recent- - search/guides/pagination) - --results-per-call RESULTS_PER_CALL - Number of results to return per call (default 10; max - 100) - corresponds to 'max_results' in the API - --max-tweets MAX_TWEETS - Maximum number of Tweets to return for this session of - requests. - --max-pages MAX_PAGES - Maximum number of pages/API calls to use for this - session. - --results-per-file RESULTS_PER_FILE - Maximum tweets to save per file. - --filename-prefix FILENAME_PREFIX - prefix for the filename where tweet json data will be - stored. - --no-print-stream disable print streaming - --print-stream Print tweet stream to stdout - --extra-headers EXTRA_HEADERS - JSON-formatted str representing a dict of additional - request headers - --debug print all info and warning messages - - -Installation -============= - -The updated Pypi install package for the Labs version is at: - -https://pypi.org/project/searchtweets-labs/ - -Another option to work directly with this code by cloning the repository, installing the required Python packages, setting up your credentials, and start making requests. - -To confirm the your code is ready to go, run the ``$python3 scripts/search-tweets.py -h`` command. You should see the help details shown above. - -Credential Handling -=================== - -The Labs Recent search endpoint uses app-only authentication. You have the choice to configure your application consumer key and secret, or a Bearer Token you have generated. If you supply the application key and secret, the client will generate a Bearer Token for you. - -Many developers might find providing your application key and secret more straightforward and letting this library manage your Bearer Token generation for you. Please see `HERE `_ for an overview of the app-only authentication method. - -We support both YAML-file based methods and environment variables for storing credentials, and provide flexible handling with sensible defaults. - -YAML method -=========== - -The simplest credential file should look like this: - -.. 
code:: yaml - - search_tweets_api: - endpoint: https://api.twitter.com/labs/2/tweets/search - consumer_key: - consumer_secret: - -By default, this library expects this file at "~/.twitter_keys.yaml", but you can pass the relevant location as needed, either with the --credential-file flag for the command-line app or as demonstrated below in a Python program. - -Both above examples require no special command-line arguments or in-program arguments. The credential parsing methods, unless otherwise specified, will look for a YAML key called search_tweets_api. - -For developers who have multiple endpoints and/or search products, you can keep all credentials in the same file and specify specific keys to use. --credential-file-key specifies this behavior in the command line app. An example: - -.. code:: yaml - - search_tweets_labsv1: - endpoint: https://api.twitter.com/labs/1/tweets/search - consumer_key: - consumer_secret: - (optional) bearer_token: - - search_tweets_labsv2: - endpoint: https://api.twitter.com/labs/2/tweets/search - consumer_key: - consumer_secret: - (optional) bearer_token: - -Environment Variables -===================== - -If you want or need to pass credentials via environment variables, you can set the appropriate variables: - -:: - - export SEARCHTWEETS_ENDPOINT= - export SEARCHTWEETS_BEARER_TOKEN= - export SEARCHTWEETS_CONSUMER_KEY= - export SEARCHTWEETS_CONSUMER_SECRET= - -The ``load_credentials`` function will attempt to find these variables if it cannot load fields from the YAML file, and it will **overwrite any credentials from the YAML file that are present as environment variables** if they have been parsed. This behavior can be changed by setting the ``load_credentials`` parameter ``env_overwrite`` to ``False``. - -The following cells demonstrates credential handling in the Python library. - -.. code:: python - - from searchtweets import load_credentials - -.. code:: python - - load_credentials(filename="./search_tweets_creds_example.yaml", - yaml_key="search_tweets_v2_example", - env_overwrite=False) - -:: - - {'bearer_token': '', - 'endpoint': 'https://api.twitter.com/labs/2/tweets/search', - 'extra_headers_dict': None} - -Environment Variable Overrides ------------------------------- - -If we set our environment variables, the program will look for them -regardless of a YAML file's validity or existence. - -.. code:: python - - import os - os.environ["SEARCHTWEETS_BEARER_TOKEN"] = "" - os.environ["SEARCHTWEETS_ENDPOINT"] = "" - - load_credentials(filename="nothing_here.yaml", yaml_key="no_key_here") - -:: - - cannot read file nothing_here.yaml - Error parsing YAML file; searching for valid environment variables - -:: - - {'bearer_token': '', - 'endpoint': ''} - -Command-line app ----------------- - -the flags: - -- ``--credential-file `` -- ``--credential-file-key `` -- ``--env-overwrite`` - -are used to control credential behavior from the command-line app. - --------------- - -Using the Comand Line Application -================================= - -The library includes an application, ``search_tweets.py``, that provides rapid access to Tweets. When you use ``pip`` to install this package, ``search_tweets.py`` is installed globally. The file is located in the ``scripts/`` directory for those who want to run it locally. - -Note that the ``--results-per-call`` flag specifies an argument to the API, not as a hard max to number of results returned from this program. 
The argument ``--max-tweets`` defines the maximum number of results to return from a single run of the ``search-tweets.py``` script. All examples assume that your credentials are set up correctly in the default location - ``.twitter_keys.yaml`` or in environment variables. - -**Stream json results to stdout without saving** - -.. code:: bash - - search_tweets.py \ - --max-tweets 1000 \ - --results-per-call 100 \ - --query "(snow OR rain) has:media -is:retweet" \ - --print-stream - -**Stream json results to stdout and save to a file** - -.. code:: bash - - search_tweets.py \ - --max-results 1000 \ - --results-per-call 100 \ - --query "(snow OR rain) has:media -is:retweet" \ - --filename-prefix beyonce_geo \ - --print-stream - -**Save to file without output** - -.. code:: bash - - search_tweets.py \ - --max-results 100 \ - --results-per-call 100 \ - --query "(snow OR rain) has:media -is:retweet" \ - --filename-prefix weather_pic \ - --no-print-stream - -One or more custom headers can be specified from the command line, using the ``--extra-headers`` argument and a JSON-formatted string representing a dictionary of extra headers: - -.. code:: bash - - search_tweets.py \ - --query "(snow OR rain) has:media -is:retweet" \ - --extra-headers '{"":""}' - -Options can be passed via a configuration file (either ini or YAML). Example files can be found in the ``config/api_config_example.config`` or ``config/api_yaml_example.yaml`` files, which might look like this: - -.. code:: bash - - [search_rules] - start_time = 2020-05-01 - end_time = 2020-05-01 - query = (snow OR rain) has:media -is:retweet - - [search_params] - results_per_call = 100 - max_tweets = 10000 - - [output_params] - save_file = True - filename_prefix = weather-pics - results_per_file = 10000000 - -Or this: - -.. code:: bash - - search_rules: - start_time: 2020-05-01 - end_time: 2020-05-01 01:01 - query: (snow OR rain) has:media -is:retweet - - search_params: - results_per_call: 100 - max_results: 500 - - output_params: - save_file: True - filename_prefix: (snow OR rain) has:media -is:retweet - results_per_file: 10000000 - -Custom headers can be specified in a config file, under a specific credentials key: - -.. code:: yaml - - search_tweets_api: - endpoint: - bearer_token: - extra_headers: - : - -When using a config file in conjunction with the command-line utility, you need to specify your config file via the ``--config-file`` parameter. Additional command-line arguments will either be added to the config file args or overwrite the config file args if both are specified and present. - -Example: - -:: - - search_tweets.py \ - --config-file myapiconfig.config \ - --no-print-stream - ------------------- - -Using the Twitter Search APIs' Python Wrapper -============================================= - -Working with the API within a Python program is straightforward. - -We'll assume that credentials are in the default location, -``~/.twitter_keys.yaml``. - -.. code:: python - - from searchtweets import ResultStream, gen_request_parameters, load_credentials - - -Labs Setup -------------- - -.. code:: python - - labs_search_args = load_credentials("~/.twitter_keys.yaml", - yaml_key="search_tweets_labs", - env_overwrite=False) - - -There is a function that formats search API rules into valid json queries called ``gen_request_parameters``. It has sensible defaults, such as pulling more Tweets per call than the default 10, and not including dates. 
Discussing the finer points of -generating search rules is out of scope for these examples; we encourage you to see the docs to learn the nuances within, but for now let's see what a query looks like. - -.. code:: python - - rule = gen_request_requests("snow", results_per_call=100) - print(rule) - -:: - - {"query":"snow","max_results":100} - -This rule will match tweets that have the text ``snow`` in them. - -From this point, there are two ways to interact with the API. There is a quick method to collect smaller amounts of Tweets to memory that requires less thought and knowledge, and interaction with the ``ResultStream`` object which will be introduced later. - -Fast Way --------- - -We'll use the ``search_args`` variable to power the configuration point for the API. The object also takes a valid search query and has options to cutoff search when hitting limits on both number of Tweets and endpoint calls. - -We'll be using the ``collect_results`` function, which has three parameters. - -- query: a valid search query, referenced earlier -- max_results: as the API handles pagination, it will stop collecting - when we get to this number -- result_stream_args: configuration args that we've already specified. - -Let's see how it goes: - -.. code:: python - - from searchtweets import collect_results - -.. code:: python - - tweets = collect_results(query, - max_results=100, - result_stream_args=labs_search_args) # change this if you need to - -An overwhelming number of Tweet attributes are made available directly, as such: - -.. code:: python - - [print(tweet.text, end='\n\n') for tweet in tweets[0:10]]; - -:: - - @CleoLoughlin Rain after the snow? Do you have ice now? - - @koofltxr Rain, 134340, still with you, winter bear, Seoul, crystal snow, sea, outro:blueside - - @TheWxMeister Sorry it ruined your camping. I was covering plants in case we got snow in the Mountain Shadows area. Thankfully we didn\u2019t. At least it didn\u2019t stick to the ground. The wind was crazy! Got just over an inch of rain. Looking forward to better weather. - - @brettlorenzen And, the reliability of \u201cNeither snow nor rain nor heat nor gloom of night stays these couriers (the #USPS) from the swift completion of their appointed rounds.\u201d - - Because black people get killed in the rain, black lives matter in the rain. It matters all the time. Snow, rain, sleet, sunny days. We're not out here because it's sunny. We're not out here for fun. We're out here because black lives matter. - - Some of the master copies of the film \u201cGone With the Wind\u201d are archived at the @librarycongress near \u201cSnow White and the Seven Dwarfs\u201d and \u201cSingin\u2019 in the Rain.\u201d GWTW isn\u2019t going to vanish off the face of the earth. - - Snow Man\u306eD.D.\u3068\nSixTONES\u306eImitation Rain\n\u6d41\u308c\u305f\u301c - - @Nonvieta Yup I work in the sanitation industry. I'm in the office however. Life would not go on without our garbage men and women out there. All day everyday rain snow or shine they out there. - - This picture of a rainbow in WA proves nothing. How do we know if this rainbow was not on Mars or the ISS? Maybe it was drawn in on the picture. WA has mail-in voting so we do have to worry aboug rain, snow, poll workers not showing up or voting machines broke on election day !! 
https://t.co/5WdHx0acS0 https://t.co/BEKtTpBW9g - - Weather in Oslo at 06:00: Clear Temp: 10.6\u00b0C Min today: 9.1\u00b0C Rain today:0.0mm Snow now: 0.0cm Wind N Conditions: Clear Daylight:18:39 hours Sunset: 22:36 - -Voila, we have some Tweets. For interactive environments and other cases where you don't care about collecting your data in a single load or don't need to operate on the stream of Tweets directly, I recommend using this convenience function. - -Working with the ResultStream ------------------------------ - -The ResultStream object will be powered by the ``search_args``, and takes the query and other configuration parameters, including a hard stop on number of pages to limit your API call usage. - -.. code:: python - - rs = ResultStream(query=query, - max_results=500, - max_pages=1, - **labs_search_args) - - print(rs) - - :: - - ResultStream: - { - "endpoint":"https:\/\/fanyv88.com:443\/https\/api.twitter.com\/labs\/2\/tweets\/search", - "request_parameters":{ - "query":"snow", - "max_results":100 - }, - "tweetify":false, - "max_results":1000 - } - -There is a function, ``.stream``, that seamlessly handles requests and pagination for a given query. It returns a generator, and to grab our 1000 Tweets that mention ``snow`` we can do this: - -.. code:: python - - tweets = list(rs.stream()) - -.. code:: python - - # using unidecode to prevent emoji/accents printing - [print(tweet) for tweet in tweets[0:10]]; - -:: - -{"id": "1270572563505254404", "text": "@CleoLoughlin Rain after the snow? Do you have ice now?"} -{"id": "1270570767038599168", "text": "@koofltxr Rain, 134340, still with you, winter bear, Seoul, crystal snow, sea, outro:blueside"} -{"id": "1270570621282340864", "text": "@TheWxMeister Sorry it ruined your camping. I was covering plants in case we got snow in the Mountain Shadows area. Thankfully we didn\u2019t. At least it didn\u2019t stick to the ground. The wind was crazy! Got just over an inch of rain. Looking forward to better weather."} -{"id": "1270569070287630337", "text": "@brettlorenzen And, the reliability of \u201cNeither snow nor rain nor heat nor gloom of night stays these couriers (the #USPS) from the swift completion of their appointed rounds.\u201d"} -{"id": "1270568690447257601", "text": "\"Because black people get killed in the rain, black lives matter in the rain. It matters all the time. Snow, rain, sleet, sunny days. We're not out here because it's sunny. We're not out here for fun. We're out here because black lives matter.\" @wisn12news https://t.co/3kZZ7q2MR9"} -{"id": "1270568607605575680", "text": "Some of the master copies of the film \u201cGone With the Wind\u201d are archived at the @librarycongress near \u201cSnow White and the Seven Dwarfs\u201d and \u201cSingin\u2019 in the Rain.\u201d GWTW isn\u2019t going to vanish off the face of the earth."} -{"id": "1270568437916426240", "text": "Snow Man\u306eD.D.\u3068\nSixTONES\u306eImitation Rain\n\u6d41\u308c\u305f\u301c"} -{"id": "1270568195519373313", "text": "@Nonvieta Yup I work in the sanitation industry. I'm in the office however. Life would not go on without our garbage men and women out there. All day everyday rain snow or shine they out there."} -{"id": "1270567737283117058", "text": "This picture of a rainbow in WA proves nothing. How do we know if this rainbow was not on Mars or the ISS? Maybe it was drawn in on the picture. 
WA has mail-in voting so we do have to worry aboug rain, snow, poll workers not showing up or voting machines broke on election day !! https://t.co/5WdHx0acS0 https://t.co/BEKtTpBW9g"} -{"id": "1270566386524356608", "text": "Weather in Oslo at 06:00: Clear Temp: 10.6\u00b0C Min today: 9.1\u00b0C Rain today:0.0mm Snow now: 0.0cm Wind N Conditions: Clear Daylight:18:39 hours Sunset: 22:36"} - -Contributing -============ - -Any contributions should follow the following pattern: - -1. Make a feature or bugfix branch, e.g., - ``git checkout -b my_new_feature`` -2. Make your changes in that branch -3. Ensure you bump the version number in ``searchtweets/_version.py`` to - reflect your changes. We use `Semantic - Versioning `__, so non-breaking enhancements - should increment the minor version, e.g., ``1.5.0 -> 1.6.0``, and - bugfixes will increment the last version, ``1.6.0 -> 1.6.1``. -4. Create a pull request - -After the pull request process is accepted, package maintainers will -handle building documentation and distribution to Pypi. - -For reference, distributing to Pypi is accomplished by the following -commands, ran from the root directory in the repo: - -.. code:: bash - - python setup.py bdist_wheel - python setup.py sdist - twine upload dist/* - -How to build the documentation: - -Building the documentation requires a few Sphinx packages to build the -webpages: - -.. code:: bash - - pip install sphinx - pip install sphinx_bootstrap_theme - pip install sphinxcontrib-napoleon - -Then (once your changes are committed to master) you should be able to -run the documentation-generating bash script and follow the -instructions: - -.. code:: bash - - bash build_sphinx_docs.sh master searchtweets - -Note that this README is also generated, and so after any README changes -you'll need to re-build the README (you need pandoc version 2.1+ for -this) and commit the result: - -.. code:: bash - - bash make_readme.sh From 759a56b2e108346640ab7abfdde67651357b9e49 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Wed, 12 Aug 2020 11:41:53 -0600 Subject: [PATCH 23/83] Update README.rst --- README.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.rst b/README.rst index a17c6f1..b1b437d 100644 --- a/README.rst +++ b/README.rst @@ -10,8 +10,6 @@ premium and enterprise tiers of Twitter search. This branch supports the `Twitte This project serves as a wrapper for the Twitter API v2 recent search endpoint, providing a command-line utility and a Python library. -To download and install this package, go to: https://pypi.org/project/searchtweets-v2/ - If you are looking for the original version that works with premium and enterprise versions of search, head on over to the main or ``enterprise-premium`` branch. (Soon, the v2 version will be promoted to the main branch.) From 48f211071d320c428498577fe4586a0e50ef1237 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Wed, 12 Aug 2020 13:29:12 -0600 Subject: [PATCH 24/83] JM: Adding link to pypi.org package --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index b1b437d..a17c6f1 100644 --- a/README.rst +++ b/README.rst @@ -10,6 +10,8 @@ premium and enterprise tiers of Twitter search. This branch supports the `Twitte This project serves as a wrapper for the Twitter API v2 recent search endpoint, providing a command-line utility and a Python library. 
+To download and install this package, go to: https://pypi.org/project/searchtweets-v2/ + If you are looking for the original version that works with premium and enterprise versions of search, head on over to the main or ``enterprise-premium`` branch. (Soon, the v2 version will be promoted to the main branch.) From d25b26a8a56a124114af0e1aec635a795aa7a409 Mon Sep 17 00:00:00 2001 From: Joe--Chen Date: Thu, 20 Aug 2020 23:06:40 -0400 Subject: [PATCH 25/83] fix two typos in README --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index a17c6f1..620c6fe 100644 --- a/README.rst +++ b/README.rst @@ -418,7 +418,7 @@ generating search rules is out of scope for these examples; we encourage you to .. code:: python - rule = gen_request_requests("snow", results_per_call=100) + rule = gen_request_parameters("snow", results_per_call=100) print(rule) :: @@ -450,7 +450,7 @@ Let's see how it goes: .. code:: python tweets = collect_results(query, - max_results=100, + max_tweets=100, result_stream_args=search_args) # change this if you need to An overwhelming number of Tweet attributes are made available directly, as such: From a96dc0a3f0e363a71d24767b83b07acc22d9ea75 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Mon, 24 Aug 2020 15:00:58 -0600 Subject: [PATCH 26/83] Update README.rst --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 620c6fe..3d5a5d9 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,6 @@ -.. image:: https://img.shields.io/endpoint?url=https%3A%2F%2Ffanyv88.com%3A443%2Fhttps%2Ftwbadges.glitch.me%2Fbadges%2Fv2 - :target: https://developer.twitter.com/en/docs/twitter-api - :alt: Twitter API v2 +.. .. image:: https://img.shields.io/endpoint?url=https%3A%2F%2Ffanyv88.com%3A443%2Fhttps%2Ftwbadges.glitch.me%2Fbadges%2Fv2 +.. :target: https://developer.twitter.com/en/docs/twitter-api +.. :alt: Twitter API v2 Python client for the Twitter API v2 recent search endpoint =========================================================== From d40cca1f3ed08b560cb6c45e14ce24df4d726a1b Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Tue, 25 Aug 2020 09:51:29 -0600 Subject: [PATCH 27/83] JM: full support for expansions, returning 'includes' array. --- scripts/search_tweets.py | 5 +++++ searchtweets/api_utils.py | 6 +++++- searchtweets/result_stream.py | 18 +++++++++++++++--- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py index 356c449..6815648 100644 --- a/scripts/search_tweets.py +++ b/scripts/search_tweets.py @@ -91,6 +91,11 @@ def parse_cmd_args(): "(default 10; max 100) - corresponds to " "'max_results' in the API") + argparser.add_argument("--expansions", + dest="expansions", + default=None, + help="""A comma-delimited list of expansions. 
Specifid expansions results in full objects in the 'includes' response object.""") + argparser.add_argument("--tweet-fields", dest="tweet_fields", default=None, diff --git a/searchtweets/api_utils.py b/searchtweets/api_utils.py index bb7d64f..67ac29a 100644 --- a/searchtweets/api_utils.py +++ b/searchtweets/api_utils.py @@ -83,7 +83,7 @@ def convert_utc_time(datetime_str): def gen_request_parameters(query, results_per_call=None, start_time=None, end_time=None, since_id=None, until_id=None, - tweet_fields=None, + tweet_fields=None, expansions=None, stringify=True): """ @@ -101,6 +101,7 @@ def gen_request_parameters(query, results_per_call=None, end_time (str or None): date format as specified by `convert_utc_time` for the end time of your search. tweet_fields (string): comma-delimted list of Tweet JSON attributes wanted in endpoint responses. Default is "id,created_at,text"). + expansions (string): comma-delimited list of object expansions. stringify (bool): specifies the return type, `dict` or json-formatted `str`. @@ -128,6 +129,8 @@ def gen_request_parameters(query, results_per_call=None, payload["until_id"] = until_id if tweet_fields: payload["tweet.fields"] = tweet_fields + if expansions: + payload["expansions"] = expansions return json.dumps(payload) if stringify else payload @@ -161,6 +164,7 @@ def intify(arg): since_id=config_dict.get("since_id", None), until_id=config_dict.get("until_id", None), tweet_fields=config_dict.get("tweet_fields", None), + expansions=config_dict.get("expansions", None), results_per_call=results_per_call) #count_bucket=config_dict.get("count_bucket", None)) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index 8d4eccf..a156d22 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -174,7 +174,7 @@ class ResultStream: session_request_counter = 0 def __init__(self, endpoint, request_parameters, bearer_token=None, extra_headers_dict=None, max_tweets=500, - tweetify=True, max_requests=None, **kwargs): + tweetify=False, max_requests=None, **kwargs): self.bearer_token = bearer_token self.extra_headers_dict = extra_headers_dict @@ -215,14 +215,25 @@ def stream(self): self.stream_started = True while True: + if self.current_tweets == None: break + + #Serve up data.tweets. for tweet in self.current_tweets: if self.total_results >= self.max_tweets: break yield self._tweet_func(tweet) self.total_results += 1 + #Serve up "includes" arrays + if self.includes != None: + yield self.includes + + #Serve up meta structure. + if self.meta != None: + yield self.meta + if self.next_token and self.total_results < self.max_tweets and self.n_requests <= self.max_requests: self.request_parameters = merge_dicts(self.request_parameters, {"next_token": self.next_token}) @@ -263,9 +274,10 @@ def execute_request(self): try: resp = json.loads(resp.content.decode(resp.encoding)) - meta = resp.get("meta", None) - self.next_token = meta.get("next_token", None) self.current_tweets = resp.get("data", None) + self.includes = resp.get("includes", None) + self.meta = resp.get("meta", None) + self.next_token = self.meta.get("next_token", None) except: print("Error parsing content as JSON.") From cd8c22c22896df05d958f8d326cacea9938a82dd Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Tue, 25 Aug 2020 14:50:45 -0600 Subject: [PATCH 28/83] JM: adding expansions to cmd-line arguments. 
--- scripts/poll_tweets.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/poll_tweets.py b/scripts/poll_tweets.py index 8e6b075..e88e1c2 100644 --- a/scripts/poll_tweets.py +++ b/scripts/poll_tweets.py @@ -20,10 +20,8 @@ # --debug flag logging.basicConfig(level=os.environ.get("LOGLEVEL", "ERROR")) - REQUIRED_KEYS = {"query", "endpoint"} - def parse_cmd_args(): argparser = argparse.ArgumentParser() help_msg = """configuration file with all parameters. Far, @@ -68,7 +66,6 @@ def parse_cmd_args(): default=5, help="""Polling interval in minutes. (default: 5 minutes)""") - argparser.add_argument("--start-time", dest="start_time", default=None, @@ -98,6 +95,11 @@ def parse_cmd_args(): "(default 10; max 100) - corresponds to " "'max_results' in the API") + argparser.add_argument("--expansions", + dest="expansions", + default=None, + help="""A comma-delimited list of expansions. Specified expansions results in full objects in the 'includes' response object.""") + argparser.add_argument("--tweet-fields", dest="tweet_fields", default=None, From caf39278067b064c4c4ca82e15518766422528c6 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Tue, 25 Aug 2020 14:53:18 -0600 Subject: [PATCH 29/83] JM: bumping v2 version to 1.0.1 --- searchtweets/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searchtweets/_version.py b/searchtweets/_version.py index 4eadb43..e39cc9e 100644 --- a/searchtweets/_version.py +++ b/searchtweets/_version.py @@ -2,4 +2,4 @@ # Copyright 2020 Twitter, Inc. # Licensed under the MIT License # https://opensource.org/licenses/MIT -VERSION = "1.0.0" +VERSION = "1.0.1" From 3778bb63c9893be8bdf94c2d29a3f185103c91d9 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Tue, 25 Aug 2020 14:54:39 -0600 Subject: [PATCH 30/83] JM: fixing typo in help screen. --- scripts/search_tweets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py index 6815648..af4b9f7 100644 --- a/scripts/search_tweets.py +++ b/scripts/search_tweets.py @@ -94,7 +94,7 @@ def parse_cmd_args(): argparser.add_argument("--expansions", dest="expansions", default=None, - help="""A comma-delimited list of expansions. Specifid expansions results in full objects in the 'includes' response object.""") + help="""A comma-delimited list of expansions. Specified expansions results in full objects in the 'includes' response object.""") argparser.add_argument("--tweet-fields", dest="tweet_fields", From 0c7707baa1c76d23ffc274025d3f78c3dc1a66ad Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Wed, 26 Aug 2020 09:49:23 -0600 Subject: [PATCH 31/83] JM: adding in support for all 'fields' request parameters. --- README.rst | 112 +++++++++++++++++++---------------- config/api_yaml_example.yaml | 16 ++--- scripts/poll_tweets.py | 22 ++++++- scripts/search_tweets.py | 22 ++++++- 4 files changed, 112 insertions(+), 60 deletions(-) diff --git a/README.rst b/README.rst index 3d5a5d9..996ab8a 100644 --- a/README.rst +++ b/README.rst @@ -61,6 +61,12 @@ When migrating this Python search client to v2 from the enterprise and premium t In this spirit of updating the parlance used, note that a core method provided by searchtweets/result_stream.py has been renamed. The method `gen_rule_payload` has been updated to `gen_request_parameters`. 
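+
+For reference, here is a minimal sketch of the renamed method (the query and parameter values are illustrative):
+
+.. code:: python
+
+   from searchtweets import gen_request_parameters
+
+   # Formerly: gen_rule_payload("snow", results_per_call=100)
+   # By default, this returns a JSON-formatted string of request parameters.
+   query = gen_request_parameters("snow", results_per_call=100)
+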
+**One key update is handling the changes in how the search endpoint returns its data.** The v2 search endpoint returns matching Tweets in a `data` array, along with an `includes` array that provides supporting objects that result from specifying `expansions`. +These expanded objects include Users, referenced Tweets, and attached media. In addition to the `data` and `includes` arrays, the search endpoint also provides a `meta` object that provides the max and min Tweet IDs included in the response, +along with a `next_token` if there is another 'page' of data to request. + +Currently, the v2 client returns the Tweets in the `data` array as individual (and atomic) JSON Tweet objects. This matches the behavior of the original search client. However, after yielding the individual Tweet objects, the client outputs arrays of User, Tweet, and media objects from the `includes` array, followed by the `meta` object. + Finally, the original version of search-tweets-python used a `Tweet Parser `__ to help manage the differences between two different JSON formats ("original" and "Activity Stream"). With v2, there is just one version of Tweet JSON, so this Tweet Parser is not used. In the original code, this Tweet parser was envoked with a `tweetify=True directive. With this v2 version, this use of the Tweet Parser is turned off by instead using `tweetify=False`. @@ -68,99 +74,103 @@ In the original code, this Tweet parser was envoked with a `tweetify=True direct Command-line options ==================== -usage: search_tweets.py - -Connected to pydev debugger (build 173.4674.33) -usage: search_tweets.py [-h] [--credential-file CREDENTIAL_FILE] - [--credential-file-key CREDENTIAL_YAML_KEY] - [--env-overwrite ENV_OVERWRITE] - [--config-file CONFIG_FILENAME] [--query QUERY] - [--start-time START_TIME] [--end-time END_TIME] - [--since-id SINCE_ID] [--until-id UNTIL_ID] - [--results-per-call RESULTS_PER_CALL] - [--expansions EXPANSIONS] - [--tweet-fields TWEET_FIELDS] - [--user-fields USER_FIELDS] - [--media-fields MEDIA_FIELDS] - [--place-fields PLACE_FIELDS] - [--poll-fields POLL_FIELDS] [--max-tweets MAX_TWEETS] - [--max-pages MAX_PAGES] - [--results-per-file RESULTS_PER_FILE] - [--filename-prefix FILENAME_PREFIX] - [--no-print-stream] [--print-stream] - [--extra-headers EXTRA_HEADERS] [--debug] - -optional arguments: - -h, --help show this help message and exit - --credential-file CREDENTIAL_FILE +.. code:: bash + + usage: search_tweets.py [-h] [--credential-file CREDENTIAL_FILE] + [--credential-file-key CREDENTIAL_YAML_KEY] + [--env-overwrite ENV_OVERWRITE] + [--config-file CONFIG_FILENAME] + [--query QUERY] + [--start-time START_TIME] + [--end-time END_TIME] + [--since-id SINCE_ID] + [--until-id UNTIL_ID] + [--results-per-call RESULTS_PER_CALL] + [--expansions EXPANSIONS] + [--tweet-fields TWEET_FIELDS] + [--user-fields USER_FIELDS] + [--media-fields MEDIA_FIELDS] + [--place-fields PLACE_FIELDS] + [--poll-fields POLL_FIELDS] + [--max-tweets MAX_TWEETS] + [--max-pages MAX_PAGES] + [--results-per-file RESULTS_PER_FILE] + [--filename-prefix FILENAME_PREFIX] + [--no-print-stream] + [--print-stream] + [--extra-headers EXTRA_HEADERS] + [--debug] + + -h, --help show this help message and exit + --credential-file CREDENTIAL_FILE Location of the yaml file used to hold your credentials. - --credential-file-key CREDENTIAL_YAML_KEY + --credential-file-key CREDENTIAL_YAML_KEY the key in the credential file used for this session's credentials. 
Defaults to search_tweets_api - --env-overwrite ENV_OVERWRITE + --env-overwrite ENV_OVERWRITE Overwrite YAML-parsed credentials with any set environment variables. See API docs or readme for details. - --config-file CONFIG_FILENAME + --config-file CONFIG_FILENAME configuration file with all parameters. Far, easier to use than the command-line args version., If a valid file is found, all args will be populated, from there. Remaining command-line args, will overrule args found in the config, file. - --query QUERY Search query. (See: + --query QUERY Search query. (See: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-rule) - --start-time START_TIME + --start-time START_TIME Start of datetime window, format 'YYYY-mm-DDTHH:MM' (default: -7 days) - --end-time END_TIME End of datetime window, format 'YYYY-mm-DDTHH:MM' + --end-time END_TIME End of datetime window, format 'YYYY-mm-DDTHH:MM' (default: most recent date) - --since-id SINCE_ID Tweet ID, will start search from Tweets after this + --since-id SINCE_ID Tweet ID, will start search from Tweets after this one. (See: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/paginate) - --until-id UNTIL_ID Tweet ID, will end search from Tweets before this one. + --until-id UNTIL_ID Tweet ID, will end search from Tweets before this one. (See: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/paginate) - --results-per-call RESULTS_PER_CALL + --results-per-call RESULTS_PER_CALL Number of results to return per call (default 10; max 100) - corresponds to 'max_results' in the API - --expansions EXPANSIONS + --expansions EXPANSIONS A comma-delimited list of object expansions to include in endpoint responses. (API default: "") - --tweet-fields TWEET_FIELDS + --tweet-fields TWEET_FIELDS A comma-delimited list of Tweet JSON attributions to include in endpoint responses. (API default: "id,text") - --user-fields USER_FIELDS + --user-fields USER_FIELDS A comma-delimited list of user JSON attributions to include in endpoint responses. (API default: "id") - --media-fields MEDIA_FIELDS + --media-fields MEDIA_FIELDS A comma-delimited list of media JSON attributions to include in endpoint responses. (API default: "id") - --place-fields PLACE_FIELDS + --place-fields PLACE_FIELDS A comma-delimited list of Twitter Place JSON attributions to include in endpoint responses. (API default: "id") - --poll-fields POLL_FIELDS + --poll-fields POLL_FIELDS A comma-delimited list of Tweet Poll JSON attributions to include in endpoint responses. (API default: "id") - --max-tweets MAX_TWEETS + --max-tweets MAX_TWEETS Maximum number of Tweets to return for this session of requests. - --max-pages MAX_PAGES + --max-pages MAX_PAGES Maximum number of pages/API calls to use for this session. - --results-per-file RESULTS_PER_FILE + --results-per-file RESULTS_PER_FILE Maximum tweets to save per file. - --filename-prefix FILENAME_PREFIX + --filename-prefix FILENAME_PREFIX prefix for the filename where tweet json data will be stored. 
- --no-print-stream disable print streaming - --print-stream Print tweet stream to stdout - --extra-headers EXTRA_HEADERS + --no-print-stream disable print streaming + --print-stream Print tweet stream to stdout + --extra-headers EXTRA_HEADERS JSON-formatted str representing a dict of additional HTTP request headers - --debug print all info and warning messages + --debug print all info and warning messages Installation @@ -310,7 +320,7 @@ Note that the ``--results-per-call`` flag specifies an argument to the API, not --max-tweets 10000 \ --results-per-call 100 \ --query "(snow OR rain) has:media -is:retweet" \ - --filename-prefix beyonce_geo \ + --filename-prefix weather_pics \ --print-stream **Save to file without output** @@ -321,7 +331,7 @@ Note that the ``--results-per-call`` flag specifies an argument to the API, not --max-tweets 10000 \ --results-per-call 100 \ --query "(snow OR rain) has:media -is:retweet" \ - --filename-prefix weather_pic \ + --filename-prefix weather_pics \ --no-print-stream One or more custom headers can be specified from the command line, using the ``--extra-headers`` argument and a JSON-formatted string representing a dictionary of extra headers: @@ -347,7 +357,7 @@ Options can be passed via a configuration file (either ini or YAML). Example fil [output_params] save_file = True - filename_prefix = weather-pics + filename_prefix = weather_pics results_per_file = 10000000 Or this: diff --git a/config/api_yaml_example.yaml b/config/api_yaml_example.yaml index bf0ca02..ed02a61 100644 --- a/config/api_yaml_example.yaml +++ b/config/api_yaml_example.yaml @@ -1,14 +1,16 @@ -search_rules: - start-time: 2020-01-06 - end-time: 2020-01-10 - query: snow colorado -is:retweet has:media +#search_rules: +# start-time: 2020-01-06 +# end-time: 2020-01-10 +# query: snow colorado -is:retweet has:media search_params: results-per-call: 100 max-tweets: 10000 - tweet-fields: id,created_at,text + tweet-fields: id,created_at,author_id,text,public_metrics,attachments,entities + user-fields: description,location,public_metrics + expansions: author_id,referenced_tweets.id,attachments.media_keys output_params: - save_file: True - filename_prefix: snow-photos + save_file: False + filename_prefix: snow_tweets results_per_file: 100000 diff --git a/scripts/poll_tweets.py b/scripts/poll_tweets.py index e88e1c2..b22b4dc 100644 --- a/scripts/poll_tweets.py +++ b/scripts/poll_tweets.py @@ -103,7 +103,27 @@ def parse_cmd_args(): argparser.add_argument("--tweet-fields", dest="tweet_fields", default=None, - help="""A comma-delimited list of Tweet JSON attributions to include in endpoint responses. (Endpoint default: "id,created_at,text")""") + help="""A comma-delimited list of Tweet JSON attributes to include in endpoint responses. (API default:"id,text")""") + + argparser.add_argument("--user-fields", + dest="user_fields", + default=None, + help="""A comma-delimited list of User JSON attributes to include in endpoint responses. (API default:"id")""") + + argparser.add_argument("--media-fields", + dest="media_fields", + default=None, + help="""A comma-delimited list of media JSON attributes to include in endpoint responses. (API default:"id")""") + + argparser.add_argument("--place-fields", + dest="place_fields", + default=None, + help="""A comma-delimited list of Twitter Place JSON attributes to include in endpoint responses. 
(API default:"id")""") + + argparser.add_argument("--poll-fields", + dest="poll_fields", + default=None, + help="""A comma-delimited list of Twitter Poll JSON attributes to include in endpoint responses. (API default:"id")""") #client options. argparser.add_argument("--max-tweets", dest="max_tweets", diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py index af4b9f7..8ed1282 100644 --- a/scripts/search_tweets.py +++ b/scripts/search_tweets.py @@ -99,7 +99,27 @@ def parse_cmd_args(): argparser.add_argument("--tweet-fields", dest="tweet_fields", default=None, - help="""A comma-delimited list of Tweet JSON attributions to include in endpoint responses. (default: "id,created_at,text")""") + help="""A comma-delimited list of Tweet JSON attributes to include in endpoint responses. (API default:"id,text")""") + + argparser.add_argument("--user-fields", + dest="user_fields", + default=None, + help="""A comma-delimited list of User JSON attributes to include in endpoint responses. (API default:"id")""") + + argparser.add_argument("--media-fields", + dest="media_fields", + default=None, + help="""A comma-delimited list of media JSON attributes to include in endpoint responses. (API default:"id")""") + + argparser.add_argument("--place-fields", + dest="place_fields", + default=None, + help="""A comma-delimited list of Twitter Place JSON attributes to include in endpoint responses. (API default:"id")""") + + argparser.add_argument("--poll-fields", + dest="poll_fields", + default=None, + help="""A comma-delimited list of Twitter Poll JSON attributes to include in endpoint responses. (API default:"id")""") #client options. argparser.add_argument("--max-tweets", dest="max_tweets", From 04057cfbf973b1bbff61d9d035a13790a9cd921d Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Wed, 26 Aug 2020 09:51:22 -0600 Subject: [PATCH 32/83] JM: bumping version --- searchtweets/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searchtweets/_version.py b/searchtweets/_version.py index e39cc9e..4085068 100644 --- a/searchtweets/_version.py +++ b/searchtweets/_version.py @@ -2,4 +2,4 @@ # Copyright 2020 Twitter, Inc. # Licensed under the MIT License # https://opensource.org/licenses/MIT -VERSION = "1.0.1" +VERSION = "1.0.2" From 196f5676f1936fd310e68a6523fcac4ba764d9da Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Wed, 26 Aug 2020 11:12:54 -0600 Subject: [PATCH 33/83] JM: removing TweetParser references. --- README.rst | 3 +-- searchtweets/api_utils.py | 17 ++++++++++++++++- searchtweets/result_stream.py | 25 +++++++++---------------- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/README.rst b/README.rst index 996ab8a..7ddb7cd 100644 --- a/README.rst +++ b/README.rst @@ -139,8 +139,7 @@ Command-line options in endpoint responses. (API default: "") --tweet-fields TWEET_FIELDS A comma-delimited list of Tweet JSON attributions to - include in endpoint responses. (API default: - "id,text") + include in endpoint responses. (API default: "id, text") --user-fields USER_FIELDS A comma-delimited list of user JSON attributions to include in endpoint responses. 
(API default: "id") diff --git a/searchtweets/api_utils.py b/searchtweets/api_utils.py index 67ac29a..ca9b8cc 100644 --- a/searchtweets/api_utils.py +++ b/searchtweets/api_utils.py @@ -83,7 +83,9 @@ def convert_utc_time(datetime_str): def gen_request_parameters(query, results_per_call=None, start_time=None, end_time=None, since_id=None, until_id=None, - tweet_fields=None, expansions=None, + tweet_fields=None, user_fields=None, media_fields=None, + place_fields=None, poll_fields=None, + expansions=None, stringify=True): """ @@ -101,6 +103,7 @@ def gen_request_parameters(query, results_per_call=None, end_time (str or None): date format as specified by `convert_utc_time` for the end time of your search. tweet_fields (string): comma-delimted list of Tweet JSON attributes wanted in endpoint responses. Default is "id,created_at,text"). + Also user_fields, media_fields, place_fields, poll_fields expansions (string): comma-delimited list of object expansions. stringify (bool): specifies the return type, `dict` or json-formatted `str`. @@ -129,6 +132,14 @@ def gen_request_parameters(query, results_per_call=None, payload["until_id"] = until_id if tweet_fields: payload["tweet.fields"] = tweet_fields + if user_fields: + payload["user.fields"] = user_fields + if media_fields: + payload["media.fields"] = media_fields + if place_fields: + payload["place.fields"] = place_fields + if poll_fields: + payload["poll.fields"] = poll_fields if expansions: payload["expansions"] = expansions @@ -164,6 +175,10 @@ def intify(arg): since_id=config_dict.get("since_id", None), until_id=config_dict.get("until_id", None), tweet_fields=config_dict.get("tweet_fields", None), + user_fields=config_dict.get("user_fields", None), + media_fields=config_dict.get("media_fields", None), + place_fields=config_dict.get("place_fields", None), + poll_fields=config_dict.get("poll_fields", None), expansions=config_dict.get("expansions", None), results_per_call=results_per_call) #count_bucket=config_dict.get("count_bucket", None)) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index a156d22..01f9965 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -16,7 +16,6 @@ import ujson as json except ImportError: import json -from tweet_parser.tweet import Tweet from .utils import merge_dicts @@ -27,9 +26,9 @@ def make_session(bearer_token=None, extra_headers_dict=None): """Creates a Requests Session for use. Accepts a bearer token - for Labs. + for v2. Args: - bearer_token (str): token for a Labs user. + bearer_token (str): token for a v2 user. """ if bearer_token is None: @@ -129,10 +128,10 @@ def request(session, url, request_parameters, **kwargs): request_json = json.loads(request_parameters) - #Using POST command, not yet supported in Labs. + #Using POST command, not yet supported in v2. #result = session.post(url, data=request_parameters, **kwargs) - #New Labs-specific code in support of GET requests. + #New v2-specific code in support of GET requests. request_url = urlencode(request_json) url = f"{url}?{request_url}" @@ -146,7 +145,7 @@ class ResultStream: pieces: wrapping metadata around a specific API call and automatic pagination of results. Args: - bearer_token (str): bearer token for Labs. + bearer_token (str): bearer token for v2. endpoint (str): API endpoint. @@ -157,12 +156,8 @@ class ResultStream: from the API call - e.g., setting ``max_tweets = 10`` would return ten results, but an API call will return at minimum 100 results by default. 
- tweetify (bool): If you are grabbing tweets and not counts, use the - tweet parser library to convert each raw tweet package to a Tweet - with lazy properties. - max_requests (int): A hard cutoff for the number of API calls this - instance will make. Good for testing in Labs environment. + instance will make. Good for testing in v2 environment. extra_headers_dict (dict): custom headers to add Example: @@ -174,14 +169,13 @@ class ResultStream: session_request_counter = 0 def __init__(self, endpoint, request_parameters, bearer_token=None, extra_headers_dict=None, max_tweets=500, - tweetify=False, max_requests=None, **kwargs): + max_requests=None, **kwargs): self.bearer_token = bearer_token self.extra_headers_dict = extra_headers_dict if isinstance(request_parameters, str): request_parameters = json.loads(request_parameters) self.request_parameters = request_parameters - self.tweetify = tweetify # magic number of max tweets if you pass a non_int self.max_tweets = (max_tweets if isinstance(max_tweets, int) else 10 ** 15) @@ -192,7 +186,7 @@ def __init__(self, endpoint, request_parameters, bearer_token=None, extra_header self.current_tweets = None self.next_token = None self.stream_started = False - self._tweet_func = Tweet if tweetify else lambda x: x + self._tweet_func = lambda x: x # magic number of requests! self.max_requests = (max_requests if max_requests is not None else 10 ** 9) @@ -283,8 +277,7 @@ def execute_request(self): print("Error parsing content as JSON.") def __repr__(self): - repr_keys = ["endpoint", "request_parameters", - "tweetify", "max_tweets"] + repr_keys = ["endpoint", "request_parameters", "max_tweets"] str_ = json.dumps(dict([(k, self.__dict__.get(k)) for k in repr_keys]), indent=4) str_ = "ResultStream: \n\t" + str_ From 374921d81a7b24028c4fc0a3f9631171b96f2571 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Wed, 26 Aug 2020 11:14:59 -0600 Subject: [PATCH 34/83] JM: bumping version. --- searchtweets/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searchtweets/_version.py b/searchtweets/_version.py index 4085068..be27f97 100644 --- a/searchtweets/_version.py +++ b/searchtweets/_version.py @@ -2,4 +2,4 @@ # Copyright 2020 Twitter, Inc. # Licensed under the MIT License # https://opensource.org/licenses/MIT -VERSION = "1.0.2" +VERSION = "1.0.3" From e9576f01bebf6879c13bccb58908844f000ed3ea Mon Sep 17 00:00:00 2001 From: j-bucholtz <12256082+j-bucholtz@users.noreply.github.com> Date: Thu, 3 Sep 2020 11:55:16 -0500 Subject: [PATCH 35/83] Update utils.py read_config to use yaml.load_safe since yaml.load is deprecated. --- searchtweets/_version.py | 2 +- searchtweets/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/searchtweets/_version.py b/searchtweets/_version.py index be27f97..f652fae 100644 --- a/searchtweets/_version.py +++ b/searchtweets/_version.py @@ -2,4 +2,4 @@ # Copyright 2020 Twitter, Inc. 
# Licensed under the MIT License # https://opensource.org/licenses/MIT -VERSION = "1.0.3" +VERSION = "1.0.4" diff --git a/searchtweets/utils.py b/searchtweets/utils.py index 697174d..a6e7ce1 100644 --- a/searchtweets/utils.py +++ b/searchtweets/utils.py @@ -186,7 +186,7 @@ def read_config(filename): if file_type == "yaml": with open(os.path.expanduser(filename)) as f: - config_dict = yaml.load(f) + config_dict = yaml.safe_load(f) config_dict = merge_dicts(*[dict(config_dict[s]) for s in config_dict.keys()]) From e9fd97c29146c7b17c2d71e35436a443d617e0e2 Mon Sep 17 00:00:00 2001 From: dgarcia360 Date: Wed, 16 Sep 2020 12:57:45 +0100 Subject: [PATCH 36/83] Added python-dateutil dependency Fixed scripts error --- searchtweets/_version.py | 2 +- setup.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/searchtweets/_version.py b/searchtweets/_version.py index be27f97..f652fae 100644 --- a/searchtweets/_version.py +++ b/searchtweets/_version.py @@ -2,4 +2,4 @@ # Copyright 2020 Twitter, Inc. # Licensed under the MIT License # https://opensource.org/licenses/MIT -VERSION = "1.0.3" +VERSION = "1.0.4" diff --git a/setup.py b/setup.py index bb7047d..1b04607 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ def parse_version(str_): license='MIT', version=VERSION, python_requires='>=3.3', - install_requires=["requests", "pyyaml"], + install_requires=["requests", "pyyaml", "python-dateutil"], packages=find_packages(), - scripts=["scripts/search.py","scripts/polling.py"], + scripts=["scripts/search_tweets.py","scripts/poll_tweets.py"], ) From c56900b185caeffd644ee4d65baf4939e81028dc Mon Sep 17 00:00:00 2001 From: dgarcia360 Date: Wed, 16 Sep 2020 13:00:06 +0100 Subject: [PATCH 37/83] Improved README examples --- README.rst | 12 ++++++------ setup.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index 7ddb7cd..86ca64c 100644 --- a/README.rst +++ b/README.rst @@ -199,7 +199,7 @@ The simplest credential file should look like this: .. code:: yaml - search_tweets_api: + search_tweets_v2: endpoint: https://api.twitter.com/2/tweets/search/recent consumer_key: consumer_secret: @@ -427,8 +427,8 @@ generating search rules is out of scope for these examples; we encourage you to .. code:: python - rule = gen_request_parameters("snow", results_per_call=100) - print(rule) + query = gen_request_parameters("snow", results_per_call=100) + print(query) :: @@ -466,7 +466,7 @@ An overwhelming number of Tweet attributes are made available directly, as such: .. code:: python - [print(tweet.text, end='\n\n') for tweet in tweets[0:10]]; + [print(tweet.text, end='\n\n') for tweet in tweets[0:10]] :: @@ -499,7 +499,7 @@ The ResultStream object will be powered by the ``search_args``, and takes the qu .. code:: python - rs = ResultStream(query=query, + rs = ResultStream(request_parameters=query, max_results=500, max_pages=1, **search_args) @@ -528,7 +528,7 @@ There is a function, ``.stream``, that seamlessly handles requests and paginatio .. 
code:: python # using unidecode to prevent emoji/accents printing - [print(tweet) for tweet in tweets[0:10]]; + [print(tweet) for tweet in tweets[0:10]] :: diff --git a/setup.py b/setup.py index 1b04607..7a7fb3d 100644 --- a/setup.py +++ b/setup.py @@ -33,5 +33,5 @@ def parse_version(str_): python_requires='>=3.3', install_requires=["requests", "pyyaml", "python-dateutil"], packages=find_packages(), - scripts=["scripts/search_tweets.py","scripts/poll_tweets.py"], + scripts=["scripts/search_tweets.py", "scripts/poll_tweets.py"], ) From ce6b182a92a5c90cef11490dc28589019a8a412d Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Wed, 16 Sep 2020 14:04:18 -0600 Subject: [PATCH 38/83] JM: some updates for 1.04 Pypi package. --- README.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.rst b/README.rst index 86ca64c..d68b8dc 100644 --- a/README.rst +++ b/README.rst @@ -570,6 +570,12 @@ commands, ran from the root directory in the repo: python setup.py sdist twine upload dist/* +If you receive an error during the ``twine upload`` step, it may due to the README.rst +having something invalid in its RST format. Using a RST linter will help fix that. + +Also, as Pypi updates are made, you may want to clear out previous versions from the package. +This can be done with this command: ``rm -rf build dist *.egg-info`` + How to build the documentation: Building the documentation requires a few Sphinx packages to build the From ef0b5724f4da6995162a4d07e35042335cb396b3 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Wed, 16 Sep 2020 14:13:14 -0600 Subject: [PATCH 39/83] JM: updates for v 1.0.4 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7a7fb3d..5c0887d 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ def parse_version(str_): if line.startswith("VERSION")][0].strip() VERSION = parse_version(_version_line) -setup(name='searchtweets-labs', +setup(name='searchtweets-v2', description="Wrapper for Twitter API v2 recent search endpoint.", url='https://github.com/twitterdev/search-tweets-python', author='Fiona Pigott, Jeff Kolb, Josh Montague, Aaron Gonzales, Jim Moffitt', From d469ee17d3b6ce25d6696f65e035df775c187467 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Wed, 16 Sep 2020 14:25:49 -0600 Subject: [PATCH 40/83] JM: adding a requirements.txt file for users not installing the Pypi package. 
--- requirements.txt | 145 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8c7313c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,145 @@ +alabaster==0.7.12 +appnope==0.1.0 +asn1crypto==0.24.0 +astroid==2.2.5 +Babel==2.8.0 +backcall==0.1.0 +backports.csv==1.0.7 +beautifulsoup4==4.8.1 +beautifultable==0.8.0 +biopython==1.76 +bleach==3.1.0 +bs4==0.0.1 +cairocffi==1.1.0 +CairoSVG==2.4.2 +certifi==2018.11.29 +cffi==1.11.5 +chardet==3.0.4 +Click==7.0 +cryptography==2.4.2 +cssselect2==0.2.2 +cycler==0.10.0 +decorator==4.3.2 +defusedxml==0.5.0 +docopt==0.6.2 +docutils==0.16 +easydict==1.9 +entrypoints==0.3 +et-xmlfile==1.0.1 +Flask==1.1.1 +future==0.16.0 +html5lib==1.0.1 +idna==2.6 +imagesize==1.2.0 +imgkit==1.0.2 +importlib-metadata==1.6.0 +ipykernel==5.1.0 +ipython==7.2.0 +ipython-genutils==0.2.0 +ipywidgets==7.4.2 +isort==4.3.21 +itsdangerous==1.1.0 +jdcal==1.4.1 +jedi==0.13.2 +Jinja2==2.10 +jsonschema==2.6.0 +jupyter==1.0.0 +jupyter-client==5.2.4 +jupyter-console==6.0.0 +jupyter-core==4.4.0 +keyring==21.2.1 +kiwisolver==1.0.1 +lazy-object-proxy==0.0.0 +lxml==4.4.1 +MarkupSafe==1.1.0 +matplotlib==3.0.3 +mccabe==0.6.1 +mistune==0.8.4 +mysql-connector==2.2.9 +mysql-connector-python==8.0.15 +nbconvert==5.4.1 +nbformat==4.4.0 +notebook==5.7.4 +numpy==1.14.4 +oauthlib==3.0.1 +odfpy==1.4.0 +openpyxl==2.4.11 +packaging==20.3 +pandas==0.25.3 +pandocfilters==1.4.2 +parso==0.3.4 +pexpect==4.6.0 +pickleshare==0.7.5 +Pillow==6.1.0 +pipreqs==0.4.9 +pkginfo==1.5.0.1 +pockets==0.9.1 +presto==0.5.13 +presto-python-client==0.7.0 +prettytable==0.7.2 +prometheus-client==0.5.0 +prompt-toolkit==2.0.8 +protobuf==3.7.0 +psycopg2-binary==2.8.3 +ptyprocess==0.6.0 +pycparser==2.19 +Pygments==2.6.1 +PyHive==0.5.2 +pykerberos==1.2.1 +pylint==2.3.1 +pyparsing==2.3.1 +Pyphen==0.9.5 +PySocks==1.7.0 +python-dateutil==2.7.3 +python-dotenv==0.10.3 +pytz==2018.4 +PyYAML==3.13 +pyzmq==17.1.2 +qtconsole==4.4.3 +readme-renderer==26.0 +records==0.5.3 +requests==2.23.0 +requests-kerberos==0.12.0 +requests-oauthlib==1.2.0 +requests-toolbelt==0.9.1 +scipy==1.4.1 +searchtweets-labs==1.0.0 +Send2Trash==1.5.0 +six==1.12.0 +snowballstemmer==2.0.0 +soupsieve==1.9.4 +Sphinx==3.0.3 +sphinx-bootstrap-theme==0.7.1 +sphinxcontrib-applehelp==1.0.2 +sphinxcontrib-devhelp==1.0.2 +sphinxcontrib-htmlhelp==1.0.3 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-napoleon==0.7 +sphinxcontrib-qthelp==1.0.3 +sphinxcontrib-serializinghtml==1.1.4 +SQLAlchemy==1.3.1 +tablib==0.13.0 +terminado==0.8.1 +testpath==0.4.2 +tinycss2==1.0.2 +tornado==5.1.1 +tqdm==4.46.0 +traitlets==4.3.2 +tweepy==3.8.0 +tweet-parser==1.13.2 +twine==3.1.1 +twitter==1.18.0 +typed-ast==1.4.0 +UNKNOWN==0.0.0 +urllib3==1.22 +virtualenv==16.7.5 +wcwidth==0.1.7 +WeasyPrint==50 +webencodings==0.5.1 +Werkzeug==0.15.6 +widgetsnbextension==3.4.2 +wrapt==1.11.2 +xlrd==1.2.0 +xlwt==1.3.0 +yarg==0.1.9 +zipp==3.1.0 From 4dcca27a443d5afc7f418dcc9624e5228a1aba71 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Wed, 16 Sep 2020 14:29:35 -0600 Subject: [PATCH 41/83] JM: added reference to requirements.txt file. 
--- README.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.rst b/README.rst index d68b8dc..7e57ecf 100644 --- a/README.rst +++ b/README.rst @@ -180,9 +180,12 @@ The updated Pypi install package for the v2 version is at: https://pypi.org/project/searchtweets-v2/ Another option to work directly with this code by cloning the repository, installing the required Python packages, setting up your credentials, and start making requests. +For those not using the Pypi package, and instead are cloning the repository, a ``requirements.txt`` is provided. Dependencies can be installed with the ``pip install -r requirements.txt`` command. To confirm the your code is ready to go, run the ``$python3 scripts/search-tweets.py -h`` command. You should see the help details shown above. + + Credential Handling =================== From 851bef58f91e77450f6f0ff8f75340bba919dabb Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Tue, 26 Jan 2021 14:58:25 -0700 Subject: [PATCH 42/83] JM: updating help notes on start_time defaults for /all launch. --- scripts/poll_tweets.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/poll_tweets.py b/scripts/poll_tweets.py index b22b4dc..6457300 100644 --- a/scripts/poll_tweets.py +++ b/scripts/poll_tweets.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Copyright 2020 Twitter, Inc. +# Copyright 2021 Twitter, Inc. # Licensed under the Apache License, Version 2.0 # http://www.apache.org/licenses/LICENSE-2.0 import os @@ -69,15 +69,13 @@ def parse_cmd_args(): argparser.add_argument("--start-time", dest="start_time", default=None, - help="""Start of datetime window, format - 'YYYY-mm-DDTHH:MM' (default: -7 days)""") + help="""Start of datetime window, format 'YYYY-mm-DDTHH:MM' (default: -7 days for /recent, -30 days for /all)""") argparser.add_argument("--end-time", dest="end_time", default=None, help="""End of datetime window, format - 'YYYY-mm-DDTHH:MM' (default: most recent - date)""") + 'YYYY-mm-DDTHH:MM' (default: to 30 seconds before request time)""") argparser.add_argument("--since-id", dest="since_id", From d0884cfabf437ab59a80ba28dd5622d7008578f6 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Tue, 26 Jan 2021 14:59:28 -0700 Subject: [PATCH 43/83] JM: updating help notes on start_time defaults for /all launch. --- scripts/search_tweets.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py index 8ed1282..6cb6227 100644 --- a/scripts/search_tweets.py +++ b/scripts/search_tweets.py @@ -66,14 +66,13 @@ def parse_cmd_args(): dest="start_time", default=None, help="""Start of datetime window, format - 'YYYY-mm-DDTHH:MM' (default: -7 days)""") + 'YYYY-mm-DDTHH:MM' (default: -7 days for /recent, -30 days for /all)""") argparser.add_argument("--end-time", dest="end_time", default=None, help="""End of datetime window, format - 'YYYY-mm-DDTHH:MM' (default: most recent - date)""") + 'YYYY-mm-DDTHH:MM' (default: to 30 seconds before request time)""") argparser.add_argument("--since-id", dest="since_id", @@ -120,8 +119,19 @@ def parse_cmd_args(): dest="poll_fields", default=None, help="""A comma-delimited list of Twitter Poll JSON attributes to include in endpoint responses. (API default:"id")""") + #TODO: add code! 
+ argparser.add_argument("--atomic", + dest="atomic", + action="store_true", + default=False, + help="Inject 'includes' objects into Tweet objects.") + + # argparser.add_argument("--output-options", + # dest="output_options", + # default=None, + # help="Set output options: 'a' - atomic, 'r' - response, 'c' - constructed") + - #client options. argparser.add_argument("--max-tweets", dest="max_tweets", type=int, help="Maximum number of Tweets to return for this session of requests.") @@ -155,6 +165,8 @@ def parse_cmd_args(): default=True, help="Print tweet stream to stdout") + + argparser.add_argument("--extra-headers", dest="extra_headers", type=str, From b60a51be2c2578557ea5d230a85fa218c8e98dbf Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Tue, 26 Jan 2021 15:00:36 -0700 Subject: [PATCH 44/83] JM: Updating README for /search/all launch. --- README.rst | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index 7e57ecf..d2e9153 100644 --- a/README.rst +++ b/README.rst @@ -2,13 +2,26 @@ .. :target: https://developer.twitter.com/en/docs/twitter-api .. :alt: Twitter API v2 -Python client for the Twitter API v2 recent search endpoint +Python client for the Twitter API v2 search endpoints =========================================================== Welcome to the ``v2`` branch of the Python search client. This branch was born from the main branch that supports -premium and enterprise tiers of Twitter search. This branch supports the `Twitter API v2 recent search endpoint `__ only, and drops support for the premium and enterprise tiers. +premium and enterprise tiers of Twitter search. This branch supports the `Twitter API v2 'recent' amd 'all' search endpoints `__ only, and drops support for the premium and enterprise tiers. -This project serves as a wrapper for the Twitter API v2 recent search endpoint, providing a command-line utility and a Python library. +This project serves as a wrapper for the Twitter API v2 search endpoints (/search/recent and /search/all), providing a command-line utility and a Python library. + +The search endpoint you want to hit is specified in the library's YAML file: + +.. code:: yaml + + search_tweets_v2: + endpoint: https://api.twitter.com/2/tweets/search/recent #Or https://api.twitter.com/2/tweets/search/all + + +The 'recent' search endpoint provides Tweets from the past 7 days. The 'all' search endpoint, launched in January 2021 as part of the 'academic research' tier of Twitter API v2 access, +provides access to all publicly avaialble Tweets posted since March 2006. + +To learn more about the Twitter academic research program, see this [Twitter blog post](https://blog.twitter.com/developer/en_us/topics/tips/2021/enabling-the-future-of-academic-research-with-the-twitter-api.html). To download and install this package, go to: https://pypi.org/project/searchtweets-v2/ @@ -19,7 +32,7 @@ the main or ``enterprise-premium`` branch. (Soon, the v2 version will be promote Features ======== -- Supports Twitter API v2 recent search. +- Supports Twitter API v2 'recent' and 'all' search. - Supports the configuration of v2 `expansions `_ and `fields `_. - Supports a new "polling" mode using the ``since-id`` search request parameter. The ``since-id``, along with the new ``until-id`` provide a way to navigate the public Tweet archive by Tweet ID. 
- Supports additional ways to specify ``start-time`` and ``end-time`` request parameters: @@ -39,7 +52,7 @@ These features were inherited from the enterprise/premium version: - Flexible usage within a python program. -Twitter API v2 recent search updates +Twitter API v2 search updates ==================================== Twitter API v2 represents an opportunity to apply previous learnings from building Twitter API v1.1. and the premium and enterprise tiers of endpoints, and redesign and rebuild from the ground up. While building this v2 version of the `search-tweets-python` library, @@ -122,9 +135,9 @@ Command-line options https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-rule) --start-time START_TIME Start of datetime window, format 'YYYY-mm-DDTHH:MM' - (default: -7 days) + (default: -7 days for /recent, -30 days for /all) --end-time END_TIME End of datetime window, format 'YYYY-mm-DDTHH:MM' - (default: most recent date) + (default: to 30 seconds before request time) --since-id SINCE_ID Tweet ID, will start search from Tweets after this one. (See: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/paginate) @@ -189,7 +202,7 @@ To confirm the your code is ready to go, run the ``$python3 scripts/search-tweet Credential Handling =================== -The Twitter API v2 recent search endpoint uses app-only authentication. You have the choice to configure your application consumer key and secret, or a Bearer Token you have generated. If you supply the application key and secret, the client will generate a Bearer Token for you. +The Twitter API v2 search endpoints uses app-only authentication. You have the choice to configure your application consumer key and secret, or a Bearer Token you have generated. If you supply the application key and secret, the client will generate a Bearer Token for you. Many developers might find providing your application key and secret more straightforward and letting this library manage your Bearer Token generation for you. Please see `HERE `_ for an overview of the app-only authentication method. From 986d337b231a18d0e9a2e96215691c7cb75dc584 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 26 Jan 2021 15:02:19 -0700 Subject: [PATCH 45/83] Update README.rst --- README.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index d2e9153..d8934ea 100644 --- a/README.rst +++ b/README.rst @@ -6,7 +6,7 @@ Python client for the Twitter API v2 search endpoints =========================================================== Welcome to the ``v2`` branch of the Python search client. This branch was born from the main branch that supports -premium and enterprise tiers of Twitter search. This branch supports the `Twitter API v2 'recent' amd 'all' search endpoints `__ only, and drops support for the premium and enterprise tiers. +premium and enterprise tiers of Twitter search. This branch supports the `Twitter API v2 'recent' and 'all' search endpoints `__ only, and drops support for the premium and enterprise tiers. This project serves as a wrapper for the Twitter API v2 search endpoints (/search/recent and /search/all), providing a command-line utility and a Python library. 
@@ -18,8 +18,7 @@ The search endpoint you want to hit is specified in the library's YAML file: endpoint: https://api.twitter.com/2/tweets/search/recent #Or https://api.twitter.com/2/tweets/search/all -The 'recent' search endpoint provides Tweets from the past 7 days. The 'all' search endpoint, launched in January 2021 as part of the 'academic research' tier of Twitter API v2 access, -provides access to all publicly avaialble Tweets posted since March 2006. +The 'recent' search endpoint provides Tweets from the past 7 days. The 'all' search endpoint, launched in January 2021 as part of the 'academic research' tier of Twitter API v2 access, provides access to all publicly avaialble Tweets posted since March 2006. To learn more about the Twitter academic research program, see this [Twitter blog post](https://blog.twitter.com/developer/en_us/topics/tips/2021/enabling-the-future-of-academic-research-with-the-twitter-api.html). From 7460af7f044f1ed800038220d87a533a3b15cf6b Mon Sep 17 00:00:00 2001 From: lk251 <21109904+lk251@users.noreply.github.com> Date: Wed, 17 Feb 2021 20:51:16 +0100 Subject: [PATCH 46/83] Update search_tweets.py update default credential key for v2 --- scripts/search_tweets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py index 6cb6227..a00db03 100644 --- a/scripts/search_tweets.py +++ b/scripts/search_tweets.py @@ -40,10 +40,10 @@ def parse_cmd_args(): argparser.add_argument("--credential-file-key", dest="credential_yaml_key", - default=None, + default="search_tweets_v2", help=("the key in the credential file used " "for this session's credentials. " - "Defaults to search_tweets_api")) + "Defaults to search_tweets_v2")) argparser.add_argument("--env-overwrite", dest="env_overwrite", @@ -243,4 +243,4 @@ def main(): print(json.dumps(tweet)) if __name__ == '__main__': - main() \ No newline at end of file + main() From 318eb26dd05fe67bb8f519f80767cbf3d1addd5b Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Fri, 19 Feb 2021 08:33:38 +0000 Subject: [PATCH 47/83] enable output options. 
this commit just as a shortcut to fork the v2 branch as a reminder to myself --- scripts/search_tweets.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py index 6cb6227..fa38434 100644 --- a/scripts/search_tweets.py +++ b/scripts/search_tweets.py @@ -126,11 +126,10 @@ def parse_cmd_args(): default=False, help="Inject 'includes' objects into Tweet objects.") - # argparser.add_argument("--output-options", - # dest="output_options", - # default=None, - # help="Set output options: 'a' - atomic, 'r' - response, 'c' - constructed") - + argparser.add_argument("--output-options", + dest="output_options", + default=None, + help="Set output options: 'a' - atomic, 'r' - response, 'c' - constructed") argparser.add_argument("--max-tweets", dest="max_tweets", type=int, @@ -243,4 +242,4 @@ def main(): print(json.dumps(tweet)) if __name__ == '__main__': - main() \ No newline at end of file + main() From 7d87d6f979be2d468c3e77964b5fd6df97a95dda Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Fri, 19 Feb 2021 23:53:48 +0000 Subject: [PATCH 48/83] add output formats to result stream --- searchtweets/result_stream.py | 97 +++++++++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 15 deletions(-) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index 01f9965..16872a1 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -18,6 +18,7 @@ import json from .utils import merge_dicts +from collections import defaultdict from ._version import VERSION @@ -183,6 +184,7 @@ def __init__(self, endpoint, request_parameters, bearer_token=None, extra_header self.total_results = 0 self.n_requests = 0 self.session = None + self.current_response = None self.current_tweets = None self.next_token = None self.stream_started = False @@ -192,6 +194,83 @@ def __init__(self, endpoint, request_parameters, bearer_token=None, extra_header else 10 ** 9) self.endpoint = endpoint + self.output_format = "a" # output options: 'a' - atomic, 'r' - response, 'c' - constructed" # todo: hardcode for now, use command line arguments + + def formatted_output(self): + # Defaults: Return empty objects for things missing in includes. + includes_media = defaultdict(lambda: {}, {media["media_key"]: media for media in self.includes["media"]}) if "media" in self.includes else defaultdict(lambda: {}) + includes_users = defaultdict(lambda: {}, {user["id"]: user for user in self.includes["users"]}) if "users" in self.includes else defaultdict(lambda: {}) # todo: check for user expansions (pinned tweet id?) 
+ includes_polls = defaultdict(lambda: {}, {poll["id"]: poll for poll in self.includes["polls"]}) if "polls" in self.includes else defaultdict(lambda: {}) + includes_place = defaultdict(lambda: {}, {place["id"]: place for place in self.includes["places"]}) if "places" in self.includes else defaultdict(lambda: {}) + includes_user_names = defaultdict(lambda: {}, {user["username"]: user for user in self.includes["users"]}) if "users" in self.includes else defaultdict(lambda: {}) # find by username, needed for mentions + includes_tweets = defaultdict(lambda: {}, {tweet["id"]: tweet for tweet in self.includes["tweets"]}) if "tweets" in self.includes else defaultdict(lambda: {}) + + def expand_tweet(tweet): + if "author_id" in tweet: + tweet["author"] = includes_users[tweet["author_id"]] + if "in_reply_to_user_id" in tweet: + tweet["in_reply_to_user"] = includes_users[tweet["in_reply_to_user_id"]] + if "attachments" in tweet: + if "media_keys" in tweet["attachments"]: + tweet["attachments"]["media"] = list(includes_media[media_key] for media_key in tweet["attachments"]["media_keys"]) + if "poll_ids" in tweet["attachments"]: + tweet["attachments"]["polls"] = list(includes_polls[poll_id] for poll_id in tweet["attachments"]["poll_ids"]) + if "geo" in tweet and len(includes_place) > 0: + tweet["geo"] = list(merge_dicts(referenced_place, includes_place[referenced_place['place_id']]) for referenced_place in tweet["geo"]) + if "entities" in tweet: + if "mentions" in tweet["entities"]: + tweet["entities"]["mentions"] = list(merge_dicts(referenced_user, includes_user_names[referenced_user['username']]) for referenced_user in tweet["entities"]["mentions"]) + if "referenced_tweets" in tweet: + tweet["referenced_tweets"] = list(merge_dicts(referenced_tweet, includes_tweets[referenced_tweet['id']]) for referenced_tweet in tweet["referenced_tweets"]) + return tweet + + # Now expand the included tweets ahead of time using all of the above + includes_tweets = defaultdict(lambda: {}, {tweet["id"]: expand_tweet(tweet) for tweet in self.includes["tweets"]}) if "tweets" in self.includes else defaultdict(lambda: {}) + + def output_response_format(): + """ + output the response as 1 "page" per line + """ + if self.total_results >= self.max_tweets: + return + yield self.current_response + self.total_results += self.meta['result_count'] + + def output_constructed_format(): + """ + output the way it was implemented originally + """ + #Serve up data.tweets. + for tweet in self.current_tweets: + if self.total_results >= self.max_tweets: + break + yield self._tweet_func(tweet) + self.total_results += 1 + + #Serve up "includes" arrays + if self.includes != None: + yield self.includes + + #Serve up meta structure. + if self.meta != None: + yield self.meta + + def output_atomic_format(): + """ + Format the results with "atomic" objects: + """ + for tweet in self.current_tweets: + if self.total_results >= self.max_tweets: + break + yield self._tweet_func(expand_tweet(tweet)) + self.total_results += 1 + + response_format = {"r": output_response_format, + "c": output_constructed_format, + "a": output_atomic_format} + + return response_format.get(self.output_format, "a")() + def stream(self): """ Main entry point for the data from the API. Will automatically paginate @@ -212,21 +291,7 @@ def stream(self): if self.current_tweets == None: break - - #Serve up data.tweets. 
- for tweet in self.current_tweets: - if self.total_results >= self.max_tweets: - break - yield self._tweet_func(tweet) - self.total_results += 1 - - #Serve up "includes" arrays - if self.includes != None: - yield self.includes - - #Serve up meta structure. - if self.meta != None: - yield self.meta + yield from self.formatted_output() if self.next_token and self.total_results < self.max_tweets and self.n_requests <= self.max_requests: self.request_parameters = merge_dicts(self.request_parameters, @@ -238,6 +303,7 @@ def stream(self): break logger.info("ending stream at {} tweets".format(self.total_results)) + self.current_response = None self.current_tweets = None self.session.close() @@ -268,6 +334,7 @@ def execute_request(self): try: resp = json.loads(resp.content.decode(resp.encoding)) + self.current_response = resp self.current_tweets = resp.get("data", None) self.includes = resp.get("includes", None) self.meta = resp.get("meta", None) From de965c72d6b0d050761fd59bac495f53494d610d Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 20 Feb 2021 00:56:03 +0000 Subject: [PATCH 49/83] simplify extracting expansions a bit --- searchtweets/result_stream.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index 16872a1..54a86eb 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -197,13 +197,25 @@ def __init__(self, endpoint, request_parameters, bearer_token=None, extra_header self.output_format = "a" # output options: 'a' - atomic, 'r' - response, 'c' - constructed" # todo: hardcode for now, use command line arguments def formatted_output(self): - # Defaults: Return empty objects for things missing in includes. - includes_media = defaultdict(lambda: {}, {media["media_key"]: media for media in self.includes["media"]}) if "media" in self.includes else defaultdict(lambda: {}) - includes_users = defaultdict(lambda: {}, {user["id"]: user for user in self.includes["users"]}) if "users" in self.includes else defaultdict(lambda: {}) # todo: check for user expansions (pinned tweet id?) - includes_polls = defaultdict(lambda: {}, {poll["id"]: poll for poll in self.includes["polls"]}) if "polls" in self.includes else defaultdict(lambda: {}) - includes_place = defaultdict(lambda: {}, {place["id"]: place for place in self.includes["places"]}) if "places" in self.includes else defaultdict(lambda: {}) - includes_user_names = defaultdict(lambda: {}, {user["username"]: user for user in self.includes["users"]}) if "users" in self.includes else defaultdict(lambda: {}) # find by username, needed for mentions - includes_tweets = defaultdict(lambda: {}, {tweet["id"]: tweet for tweet in self.includes["tweets"]}) if "tweets" in self.includes else defaultdict(lambda: {}) + + def extract_includes(expansion, _id="id"): + """ + Return empty objects for things missing in includes. + """ + if expansion in self.includes: + return defaultdict( + lambda: {}, + {include[_id]: include for include in self.includes[expansion]}, + ) + else: + return defaultdict(lambda: {}) + + includes_media = extract_includes("media", "media_key") + includes_users = extract_includes("users") # todo: check for user expansions (pinned tweet id?) 
+ includes_user_names = extract_includes("users", "username") # find by username, needed for mentions + includes_polls = extract_includes("polls") + includes_place = extract_includes("places") + includes_tweets = extract_includes("tweets") def expand_tweet(tweet): if "author_id" in tweet: @@ -225,7 +237,8 @@ def expand_tweet(tweet): return tweet # Now expand the included tweets ahead of time using all of the above - includes_tweets = defaultdict(lambda: {}, {tweet["id"]: expand_tweet(tweet) for tweet in self.includes["tweets"]}) if "tweets" in self.includes else defaultdict(lambda: {}) + if "tweets" in self.includes: + includes_tweets = defaultdict(lambda: {}, {tweet["id"]: expand_tweet(tweet) for tweet in self.includes["tweets"]}) def output_response_format(): """ From 7eb774a6a2f37a1b7ac87699bbe1f082070f7ee1 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 20 Feb 2021 01:06:58 +0000 Subject: [PATCH 50/83] add atomic and output format args options --- scripts/search_tweets.py | 2 +- searchtweets/result_stream.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py index fa38434..1d8df5d 100644 --- a/scripts/search_tweets.py +++ b/scripts/search_tweets.py @@ -119,7 +119,7 @@ def parse_cmd_args(): dest="poll_fields", default=None, help="""A comma-delimited list of Twitter Poll JSON attributes to include in endpoint responses. (API default:"id")""") - #TODO: add code! + argparser.add_argument("--atomic", dest="atomic", action="store_true", diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index 54a86eb..49cb641 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -170,7 +170,7 @@ class ResultStream: session_request_counter = 0 def __init__(self, endpoint, request_parameters, bearer_token=None, extra_headers_dict=None, max_tweets=500, - max_requests=None, **kwargs): + max_requests=None, atomic=False, output_options="c", **kwargs): self.bearer_token = bearer_token self.extra_headers_dict = extra_headers_dict @@ -193,8 +193,10 @@ def __init__(self, endpoint, request_parameters, bearer_token=None, extra_header self.max_requests = (max_requests if max_requests is not None else 10 ** 9) self.endpoint = endpoint - - self.output_format = "a" # output options: 'a' - atomic, 'r' - response, 'c' - constructed" # todo: hardcode for now, use command line arguments + # Tweet output format: 'a' - atomic, 'r' - response, 'c' - constructed + if atomic: + self.output_format = "a" + self.output_format = output_options def formatted_output(self): From c4f64ccba6dc8b0a6fc66476d4f28591c0f660b2 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 20 Feb 2021 01:11:03 +0000 Subject: [PATCH 51/83] require fewer dependencies in requirements.txt there are a lot of extra ones in the list that are not needed. 
--- requirements.txt | 148 +---------------------------------------------- 1 file changed, 3 insertions(+), 145 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8c7313c..c4f9676 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,145 +1,3 @@ -alabaster==0.7.12 -appnope==0.1.0 -asn1crypto==0.24.0 -astroid==2.2.5 -Babel==2.8.0 -backcall==0.1.0 -backports.csv==1.0.7 -beautifulsoup4==4.8.1 -beautifultable==0.8.0 -biopython==1.76 -bleach==3.1.0 -bs4==0.0.1 -cairocffi==1.1.0 -CairoSVG==2.4.2 -certifi==2018.11.29 -cffi==1.11.5 -chardet==3.0.4 -Click==7.0 -cryptography==2.4.2 -cssselect2==0.2.2 -cycler==0.10.0 -decorator==4.3.2 -defusedxml==0.5.0 -docopt==0.6.2 -docutils==0.16 -easydict==1.9 -entrypoints==0.3 -et-xmlfile==1.0.1 -Flask==1.1.1 -future==0.16.0 -html5lib==1.0.1 -idna==2.6 -imagesize==1.2.0 -imgkit==1.0.2 -importlib-metadata==1.6.0 -ipykernel==5.1.0 -ipython==7.2.0 -ipython-genutils==0.2.0 -ipywidgets==7.4.2 -isort==4.3.21 -itsdangerous==1.1.0 -jdcal==1.4.1 -jedi==0.13.2 -Jinja2==2.10 -jsonschema==2.6.0 -jupyter==1.0.0 -jupyter-client==5.2.4 -jupyter-console==6.0.0 -jupyter-core==4.4.0 -keyring==21.2.1 -kiwisolver==1.0.1 -lazy-object-proxy==0.0.0 -lxml==4.4.1 -MarkupSafe==1.1.0 -matplotlib==3.0.3 -mccabe==0.6.1 -mistune==0.8.4 -mysql-connector==2.2.9 -mysql-connector-python==8.0.15 -nbconvert==5.4.1 -nbformat==4.4.0 -notebook==5.7.4 -numpy==1.14.4 -oauthlib==3.0.1 -odfpy==1.4.0 -openpyxl==2.4.11 -packaging==20.3 -pandas==0.25.3 -pandocfilters==1.4.2 -parso==0.3.4 -pexpect==4.6.0 -pickleshare==0.7.5 -Pillow==6.1.0 -pipreqs==0.4.9 -pkginfo==1.5.0.1 -pockets==0.9.1 -presto==0.5.13 -presto-python-client==0.7.0 -prettytable==0.7.2 -prometheus-client==0.5.0 -prompt-toolkit==2.0.8 -protobuf==3.7.0 -psycopg2-binary==2.8.3 -ptyprocess==0.6.0 -pycparser==2.19 -Pygments==2.6.1 -PyHive==0.5.2 -pykerberos==1.2.1 -pylint==2.3.1 -pyparsing==2.3.1 -Pyphen==0.9.5 -PySocks==1.7.0 -python-dateutil==2.7.3 -python-dotenv==0.10.3 -pytz==2018.4 -PyYAML==3.13 -pyzmq==17.1.2 -qtconsole==4.4.3 -readme-renderer==26.0 -records==0.5.3 -requests==2.23.0 -requests-kerberos==0.12.0 -requests-oauthlib==1.2.0 -requests-toolbelt==0.9.1 -scipy==1.4.1 -searchtweets-labs==1.0.0 -Send2Trash==1.5.0 -six==1.12.0 -snowballstemmer==2.0.0 -soupsieve==1.9.4 -Sphinx==3.0.3 -sphinx-bootstrap-theme==0.7.1 -sphinxcontrib-applehelp==1.0.2 -sphinxcontrib-devhelp==1.0.2 -sphinxcontrib-htmlhelp==1.0.3 -sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-napoleon==0.7 -sphinxcontrib-qthelp==1.0.3 -sphinxcontrib-serializinghtml==1.1.4 -SQLAlchemy==1.3.1 -tablib==0.13.0 -terminado==0.8.1 -testpath==0.4.2 -tinycss2==1.0.2 -tornado==5.1.1 -tqdm==4.46.0 -traitlets==4.3.2 -tweepy==3.8.0 -tweet-parser==1.13.2 -twine==3.1.1 -twitter==1.18.0 -typed-ast==1.4.0 -UNKNOWN==0.0.0 -urllib3==1.22 -virtualenv==16.7.5 -wcwidth==0.1.7 -WeasyPrint==50 -webencodings==0.5.1 -Werkzeug==0.15.6 -widgetsnbextension==3.4.2 -wrapt==1.11.2 -xlrd==1.2.0 -xlwt==1.3.0 -yarg==0.1.9 -zipp==3.1.0 +requests +PyYAML +python-dateutil From 415656525fde8d0fd1969803346234eac9c74083 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sun, 21 Feb 2021 21:16:12 +0000 Subject: [PATCH 52/83] fix geo expansion merge --- searchtweets/result_stream.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index 49cb641..f7f0297 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -230,7 +230,7 @@ def expand_tweet(tweet): if "poll_ids" in tweet["attachments"]: 
tweet["attachments"]["polls"] = list(includes_polls[poll_id] for poll_id in tweet["attachments"]["poll_ids"]) if "geo" in tweet and len(includes_place) > 0: - tweet["geo"] = list(merge_dicts(referenced_place, includes_place[referenced_place['place_id']]) for referenced_place in tweet["geo"]) + tweet["geo"] = merge_dicts(tweet["geo"], includes_place[tweet["geo"]['place_id']]) if "entities" in tweet: if "mentions" in tweet["entities"]: tweet["entities"]["mentions"] = list(merge_dicts(referenced_user, includes_user_names[referenced_user['username']]) for referenced_user in tweet["entities"]["mentions"]) @@ -393,4 +393,4 @@ def collect_results(query, max_tweets=1000, result_stream_args=None): rs = ResultStream(request_parameters=query, max_tweets=max_tweets, **result_stream_args) - return list(rs.stream()) \ No newline at end of file + return list(rs.stream()) From 05c3d40bb181bfce7c87a161c0c6a63fe2832082 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 27 Feb 2021 21:00:49 +0000 Subject: [PATCH 53/83] make polls and geo 1 object, instead of a list --- searchtweets/result_stream.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index f7f0297..d9a9d35 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -228,9 +228,11 @@ def expand_tweet(tweet): if "media_keys" in tweet["attachments"]: tweet["attachments"]["media"] = list(includes_media[media_key] for media_key in tweet["attachments"]["media_keys"]) if "poll_ids" in tweet["attachments"]: - tweet["attachments"]["polls"] = list(includes_polls[poll_id] for poll_id in tweet["attachments"]["poll_ids"]) + poll_id = tweet["attachments"]["poll_ids"][-1] + tweet["attachments"]["poll"] = includes_polls[poll_id] if "geo" in tweet and len(includes_place) > 0: - tweet["geo"] = merge_dicts(tweet["geo"], includes_place[tweet["geo"]['place_id']]) + place_id = tweet["geo"]['place_id'] + tweet["geo"] = merge_dicts(tweet["geo"], includes_place[place_id]) if "entities" in tweet: if "mentions" in tweet["entities"]: tweet["entities"]["mentions"] = list(merge_dicts(referenced_user, includes_user_names[referenced_user['username']]) for referenced_user in tweet["entities"]["mentions"]) From 4a039b06350fef2567b8553427ec62a2f6feaff7 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sun, 28 Feb 2021 21:44:13 +0000 Subject: [PATCH 54/83] alternative way to expand results --- searchtweets/result_stream.py | 88 +++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 30 deletions(-) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index d9a9d35..9688286 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -212,37 +212,65 @@ def extract_includes(expansion, _id="id"): else: return defaultdict(lambda: {}) + # Users extracted both by id and by username for expanding mentions + includes_users = merge_dicts(extract_includes("users"), extract_includes("users", "username")) + # Tweets in includes will themselves be expanded + includes_tweets = extract_includes("tweets") + # Media is by media_key, not id includes_media = extract_includes("media", "media_key") - includes_users = extract_includes("users") # todo: check for user expansions (pinned tweet id?) 
- includes_user_names = extract_includes("users", "username") # find by username, needed for mentions includes_polls = extract_includes("polls") - includes_place = extract_includes("places") - includes_tweets = extract_includes("tweets") + includes_places = extract_includes("places") + # Errors are returned but unused here + includes_errors = extract_includes("errors") + + def expand_payload(payload): + """ + Recursively step through an object and sub objects and append extra data. + """ + + # Don't try to expand on primitive values, return strings as is: + if isinstance(payload, (str, bool, int, float)): + return payload + # expand list items individually: + elif isinstance(payload, list): + payload = [expand_payload(item) for item in payload] + return payload + # Try to expand on dicts within dicts: + elif isinstance(payload, dict): + for key, value in payload.items(): + payload[key] = expand_payload(value) + + if "author_id" in payload: + payload["author"] = includes_users[payload["author_id"]] + + if "in_reply_to_user_id" in payload: + payload["in_reply_to_user"] = includes_users[payload["in_reply_to_user_id"]] + + if "media_keys" in payload: + payload["media"] = list(includes_media[media_key] for media_key in payload["media_keys"]) + + if "poll_ids" in payload: + poll_id = payload["poll_ids"][-1] # always 1, only 1 poll per tweet. + payload["poll"] = includes_polls[poll_id] + + if "geo" in payload: + place_id = payload["geo"]['place_id'] + payload["geo"] = merge_dicts(payload["geo"], includes_places[place_id]) + + if "mentions" in payload: + payload["mentions"] = list(merge_dicts(referenced_user, includes_users[referenced_user['username']]) for referenced_user in payload["mentions"]) + + if "referenced_tweets" in payload: + payload["referenced_tweets"] = list(merge_dicts(referenced_tweet, includes_tweets[referenced_tweet['id']]) for referenced_tweet in payload["referenced_tweets"]) + + if "pinned_tweet_id" in payload: + payload["pinned_tweet"] = includes_tweets[payload["pinned_tweet_id"]] + + return payload - def expand_tweet(tweet): - if "author_id" in tweet: - tweet["author"] = includes_users[tweet["author_id"]] - if "in_reply_to_user_id" in tweet: - tweet["in_reply_to_user"] = includes_users[tweet["in_reply_to_user_id"]] - if "attachments" in tweet: - if "media_keys" in tweet["attachments"]: - tweet["attachments"]["media"] = list(includes_media[media_key] for media_key in tweet["attachments"]["media_keys"]) - if "poll_ids" in tweet["attachments"]: - poll_id = tweet["attachments"]["poll_ids"][-1] - tweet["attachments"]["poll"] = includes_polls[poll_id] - if "geo" in tweet and len(includes_place) > 0: - place_id = tweet["geo"]['place_id'] - tweet["geo"] = merge_dicts(tweet["geo"], includes_place[place_id]) - if "entities" in tweet: - if "mentions" in tweet["entities"]: - tweet["entities"]["mentions"] = list(merge_dicts(referenced_user, includes_user_names[referenced_user['username']]) for referenced_user in tweet["entities"]["mentions"]) - if "referenced_tweets" in tweet: - tweet["referenced_tweets"] = list(merge_dicts(referenced_tweet, includes_tweets[referenced_tweet['id']]) for referenced_tweet in tweet["referenced_tweets"]) - return tweet - - # Now expand the included tweets ahead of time using all of the above - if "tweets" in self.includes: - includes_tweets = defaultdict(lambda: {}, {tweet["id"]: expand_tweet(tweet) for tweet in self.includes["tweets"]}) + # First, expand the included tweets, before processing actual result tweets: + for included_id, included_tweet in 
extract_includes("tweets").items(): + includes_tweets[included_id] = expand_payload(included_tweet) def output_response_format(): """ @@ -279,14 +307,14 @@ def output_atomic_format(): for tweet in self.current_tweets: if self.total_results >= self.max_tweets: break - yield self._tweet_func(expand_tweet(tweet)) + yield self._tweet_func(expand_payload(tweet)) self.total_results += 1 response_format = {"r": output_response_format, "c": output_constructed_format, "a": output_atomic_format} - return response_format.get(self.output_format, "a")() + return response_format.get(self.output_format)() def stream(self): """ From d18d2cd9564147be91c4b890ce65256ef96c9c88 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sun, 28 Feb 2021 21:46:56 +0000 Subject: [PATCH 55/83] whitespace --- searchtweets/result_stream.py | 1 + 1 file changed, 1 insertion(+) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index 9688286..ed022be 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -424,3 +424,4 @@ def collect_results(query, max_tweets=1000, result_stream_args=None): max_tweets=max_tweets, **result_stream_args) return list(rs.stream()) + From 769da9354b5e65b6a5bcdac8196eea9a7ead32af Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sun, 28 Feb 2021 21:47:17 +0000 Subject: [PATCH 56/83] Update result_stream.py --- searchtweets/result_stream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index ed022be..9688286 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -424,4 +424,3 @@ def collect_results(query, max_tweets=1000, result_stream_args=None): max_tweets=max_tweets, **result_stream_args) return list(rs.stream()) - From 7c2b9de57d8ddaaaa489803ca15ea4f1306d8f5b Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sun, 28 Feb 2021 21:59:56 +0000 Subject: [PATCH 57/83] whitespace --- searchtweets/result_stream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index 9688286..7d5ced4 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -423,4 +423,4 @@ def collect_results(query, max_tweets=1000, result_stream_args=None): rs = ResultStream(request_parameters=query, max_tweets=max_tweets, **result_stream_args) - return list(rs.stream()) + return list(rs.stream()) \ No newline at end of file From b45dcb1ab0b4fd57e28dfc053c35735c1c91c4e0 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Mon, 1 Mar 2021 20:15:29 -0700 Subject: [PATCH 58/83] JM: updating default cred key to 'search_tweets_v2' --- README.rst | 6 +++--- searchtweets/credentials.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index d8934ea..6013735 100644 --- a/README.rst +++ b/README.rst @@ -119,7 +119,7 @@ Command-line options credentials. --credential-file-key CREDENTIAL_YAML_KEY the key in the credential file used for this session's - credentials. Defaults to search_tweets_api + credentials. Defaults to search_tweets_v2 --env-overwrite ENV_OVERWRITE Overwrite YAML-parsed credentials with any set environment variables. See API docs or readme for @@ -222,7 +222,7 @@ The simplest credential file should look like this: By default, this library expects this file at "~/.twitter_keys.yaml", but you can pass the relevant location as needed, either with the --credential-file flag for the command-line app or as demonstrated below in a Python program. 
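For reference, a minimal sketch of that in-program usage (the file path here is illustrative; `search_tweets_v2` is the new default YAML key introduced by this patch):

.. code:: python

    from searchtweets import load_credentials

    # Load keys from an explicitly passed file instead of the
    # default "~/.twitter_keys.yaml" location.
    search_args = load_credentials(filename="./my_keys.yaml",
                                   yaml_key="search_tweets_v2",
                                   env_overwrite=False)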
-Both above examples require no special command-line arguments or in-program arguments. The credential parsing methods, unless otherwise specified, will look for a YAML key called search_tweets_api. +Both above examples require no special command-line arguments or in-program arguments. The credential parsing methods, unless otherwise specified, will look for a YAML key called search_tweets_v2. For developers who have multiple endpoints and/or search products, you can keep all credentials in the same file and specify specific keys to use. --credential-file-key specifies this behavior in the command line app. An example: @@ -396,7 +396,7 @@ Custom headers can be specified in a config file, under a specific credentials k .. code:: yaml - search_tweets_api: + search_tweets_v2: endpoint: bearer_token: extra_headers: diff --git a/searchtweets/credentials.py b/searchtweets/credentials.py index 309544c..cddfc5a 100644 --- a/searchtweets/credentials.py +++ b/searchtweets/credentials.py @@ -138,7 +138,7 @@ def load_credentials(filename=None, {'endpoint': 'https://endpoint'} """ - yaml_key = yaml_key if yaml_key is not None else "search_tweets_api" + yaml_key = yaml_key if yaml_key is not None else "search_tweets_v2" filename = "~/.twitter_keys.yaml" if filename is None else filename yaml_vars = _load_yaml_credentials(filename=filename, yaml_key=yaml_key) From 9656a8e3806ed02ef8a30631233d2a7a9fb2f7b9 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Mon, 1 Mar 2021 20:26:38 -0700 Subject: [PATCH 59/83] JM: bumping version for new package. --- searchtweets/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searchtweets/_version.py b/searchtweets/_version.py index f652fae..7c5168b 100644 --- a/searchtweets/_version.py +++ b/searchtweets/_version.py @@ -2,4 +2,4 @@ # Copyright 2020 Twitter, Inc. # Licensed under the MIT License # https://opensource.org/licenses/MIT -VERSION = "1.0.4" +VERSION = "1.0.7" From 35df66ac9e32deed86488eee7274298c42707296 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 27 Mar 2021 04:44:54 +0000 Subject: [PATCH 60/83] fix bug if no expansions are returned --- searchtweets/result_stream.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index 7d5ced4..574faaf 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -204,7 +204,7 @@ def extract_includes(expansion, _id="id"): """ Return empty objects for things missing in includes. 
""" - if expansion in self.includes: + if self.includes is not None and expansion in self.includes: return defaultdict( lambda: {}, {include[_id]: include for include in self.includes[expansion]}, @@ -423,4 +423,4 @@ def collect_results(query, max_tweets=1000, result_stream_args=None): rs = ResultStream(request_parameters=query, max_tweets=max_tweets, **result_stream_args) - return list(rs.stream()) \ No newline at end of file + return list(rs.stream()) From cdf7d1eb9bd3d300a19d8e25361920ccd7f010ad Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 27 Mar 2021 04:53:36 +0000 Subject: [PATCH 61/83] set default output to original API responses --- searchtweets/result_stream.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index 574faaf..6983e27 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -170,7 +170,7 @@ class ResultStream: session_request_counter = 0 def __init__(self, endpoint, request_parameters, bearer_token=None, extra_headers_dict=None, max_tweets=500, - max_requests=None, atomic=False, output_options="c", **kwargs): + max_requests=None, atomic=False, output_options="r", **kwargs): self.bearer_token = bearer_token self.extra_headers_dict = extra_headers_dict @@ -193,7 +193,6 @@ def __init__(self, endpoint, request_parameters, bearer_token=None, extra_header self.max_requests = (max_requests if max_requests is not None else 10 ** 9) self.endpoint = endpoint - # Tweet output format: 'a' - atomic, 'r' - response, 'c' - constructed if atomic: self.output_format = "a" self.output_format = output_options From 53d78e1de7bc203eb256f277b12444ed0070ed56 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 27 Mar 2021 04:59:40 +0000 Subject: [PATCH 62/83] rename output format options --- scripts/search_tweets.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py index cfb079e..8ff9031 100644 --- a/scripts/search_tweets.py +++ b/scripts/search_tweets.py @@ -129,7 +129,10 @@ def parse_cmd_args(): argparser.add_argument("--output-options", dest="output_options", default=None, - help="Set output options: 'a' - atomic, 'r' - response, 'c' - constructed") + help="""Set output format: + 'r' Unmodified API Responses. (default). + 'a' Atomic Tweets: Tweet objects with expansions inline. 
+ 'm' Message Stream: Tweets, Expansions, and Metadata as a stream of messages.""") argparser.add_argument("--max-tweets", dest="max_tweets", type=int, From 465ff2e0fb20e51e7dd8894d106441e9d7fba818 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 27 Mar 2021 05:04:35 +0000 Subject: [PATCH 63/83] rename output format options --- searchtweets/result_stream.py | 37 ++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index 6983e27..5072019 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -280,38 +280,39 @@ def output_response_format(): yield self.current_response self.total_results += self.meta['result_count'] - def output_constructed_format(): + def output_atomic_format(): + """ + Format the results with "atomic" objects: + """ + for tweet in self.current_tweets: + if self.total_results >= self.max_tweets: + break + yield self._tweet_func(expand_payload(tweet)) + self.total_results += 1 + + def output_message_stream_format(): """ - output the way it was implemented originally + output as a stream of messages, + the way it was implemented originally """ - #Serve up data.tweets. + # Serve up data.tweets. for tweet in self.current_tweets: if self.total_results >= self.max_tweets: break yield self._tweet_func(tweet) self.total_results += 1 - #Serve up "includes" arrays + # Serve up "includes" arrays, this includes errors if self.includes != None: yield self.includes - #Serve up meta structure. + # Serve up meta structure. if self.meta != None: yield self.meta - def output_atomic_format(): - """ - Format the results with "atomic" objects: - """ - for tweet in self.current_tweets: - if self.total_results >= self.max_tweets: - break - yield self._tweet_func(expand_payload(tweet)) - self.total_results += 1 - - response_format = {"r": output_response_format, - "c": output_constructed_format, - "a": output_atomic_format} + response_format = {"r": output_response_format, + "a": output_atomic_format, + "m": output_message_stream_format} return response_format.get(self.output_format)() From 424a43025dde52cf67f3eb6fac3c9028600e06f8 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 27 Mar 2021 05:25:52 +0000 Subject: [PATCH 64/83] add output formats docs --- README.rst | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index d8934ea..a69a46f 100644 --- a/README.rst +++ b/README.rst @@ -33,6 +33,7 @@ Features - Supports Twitter API v2 'recent' and 'all' search. - Supports the configuration of v2 `expansions `_ and `fields `_. +- Supports multiple output formats: Original API responses (new default), as a stream of messages (previous default in versions <1.0.7), and new 'atomic' format with expansions included in tweets. - Supports a new "polling" mode using the ``since-id`` search request parameter. The ``since-id``, along with the new ``until-id`` provide a way to navigate the public Tweet archive by Tweet ID. - Supports additional ways to specify ``start-time`` and ``end-time`` request parameters: @@ -77,7 +78,7 @@ In this spirit of updating the parlance used, note that a core method provided b These expanded objects include Users, referenced Tweets, and attached media. In addition to the `data` and `includes` arrays, the search endpoint also provides a `meta` object that provides the max and min Tweet IDs included in the response, along with a `next_token` if there is another 'page' of data to request. 
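As a rough illustration of that structure (every field value below is invented; only the shape follows the description above), a single response "page" looks like:

.. code:: python

    # A trimmed, hypothetical v2 search response "page":
    response = {
        "data": [  # the matching Tweets
            {"id": "21", "text": "just setting up my twttr"},
        ],
        "includes": {  # expanded objects, e.g. Users
            "users": [{"id": "12", "username": "jack"}],
        },
        "meta": {  # pagination bookkeeping
            "newest_id": "21",
            "oldest_id": "21",
            "result_count": 1,
            "next_token": "b26v89c19zqg8o3f",  # only present when more pages exist
        },
    }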
-Currently, the v2 client returns the Tweets in the `data` array as individual (and atomic) JSON Tweet objects. This matches the behavior of the original search client. However, after yielding the individual Tweet objects, the client outputs arrays of User, Tweet, and media objects from the `includes` array, followed by the `meta` object.
+Currently, the v2 client returns the original API responses. Optionally, it can output a stream of Tweet objects with all expansions included in each tweet. Alternatively, it can output a stream of messages, yielding the individual Tweet objects, arrays of User, Tweet, and media objects from the `includes` array, followed by the `meta` object. This matches the behavior of the original search client, and was the default output format in versions 1.0.7 and earlier.

 Finally, the original version of search-tweets-python used a `Tweet Parser `__ to help manage the differences between two different JSON formats ("original" and "Activity Stream"). With v2, there is just one version of Tweet JSON, so this Tweet Parser is not used.

 In the original code, this Tweet parser was invoked with a `tweetify=True` directive. With this v2 version, this use of the Tweet Parser is turned off by instead using `tweetify=False`.

@@ -110,6 +111,8 @@ Command-line options
                            [--filename-prefix FILENAME_PREFIX]
                            [--no-print-stream]
                            [--print-stream]
+                           [--output-options]
+                           [--atomic]
                            [--extra-headers EXTRA_HEADERS]
                            [--debug]

@@ -178,6 +181,13 @@ Command-line options
                         stored.
   --no-print-stream     disable print streaming
   --print-stream        Print tweet stream to stdout
+  --output-options      Set output format:
+                        'r' Unmodified API Responses. (default).
+                        'a' Atomic Tweets: Tweet objects with expansions inline.
+                        'm' Message Stream: Tweets, Expansions, and Metadata
+                        as a stream of messages.
+  --atomic              Output "Atomic" Tweet format.
+                        Equivalent to setting --output-format 'a'.
   --extra-headers EXTRA_HEADERS
                         JSON-formatted str representing a dict of additional
                         HTTP request headers

From b94df7b9f2a6f49c13c1c2b0c1b88dd9cf15f846 Mon Sep 17 00:00:00 2001
From: "@snowman"
Date: Mon, 12 Apr 2021 17:42:52 -0600
Subject: [PATCH 65/83] Update README.rst

---
 README.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index 6013735..c6680ec 100644
--- a/README.rst
+++ b/README.rst
@@ -36,9 +36,9 @@ Features
 - Supports a new "polling" mode using the ``since-id`` search request parameter. The ``since-id``, along with the new ``until-id`` provide a way to navigate the public Tweet archive by Tweet ID.
 - Supports additional ways to specify ``start-time`` and ``end-time`` request parameters:

-  - d# - For example, 'd2' sets ``start-time`` to (exactly) two days ago.
-  - h# - For example, 'h12' sets ``start-time`` to (exactly) twelve hours ago.
-  - m# - For example, 'm15' sets ``start-time`` to (exactly) fifteen minutes ago.
+  - d# - For example, '2d' sets ``start-time`` to (exactly) two days ago.
+  - h# - For example, '12h' sets ``start-time`` to (exactly) twelve hours ago.
+  - m# - For example, '15m' sets ``start-time`` to (exactly) fifteen minutes ago.

 These are handy for kicking off searches with a backfill period, and also work with the ``end-time`` request parameter.
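As a quick, hedged sketch of the shorthand above (the query text is illustrative, and the call assumes the client as it stands at this point in the series, before the later `granularity` parameter is introduced):

.. code:: python

    from searchtweets import gen_request_parameters

    # '2d' resolves to a start-time of exactly two days ago (UTC),
    # so this request asks for the last two days of matching Tweets.
    query = gen_request_parameters("snow has:media -is:retweet",
                                   results_per_call=100,
                                   start_time="2d")
    print(query)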
From ec36e9f19a69804f6fec95cfe8427c6fad1e0f3f Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Tue, 13 Apr 2021 21:51:35 +0100 Subject: [PATCH 66/83] remove extra 'atomic' command line param --- scripts/search_tweets.py | 6 ------ searchtweets/result_stream.py | 4 +--- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py index 8ff9031..c63dd4b 100644 --- a/scripts/search_tweets.py +++ b/scripts/search_tweets.py @@ -120,12 +120,6 @@ def parse_cmd_args(): default=None, help="""A comma-delimited list of Twitter Poll JSON attributes to include in endpoint responses. (API default:"id")""") - argparser.add_argument("--atomic", - dest="atomic", - action="store_true", - default=False, - help="Inject 'includes' objects into Tweet objects.") - argparser.add_argument("--output-options", dest="output_options", default=None, diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py index 5072019..88341b3 100644 --- a/searchtweets/result_stream.py +++ b/searchtweets/result_stream.py @@ -170,7 +170,7 @@ class ResultStream: session_request_counter = 0 def __init__(self, endpoint, request_parameters, bearer_token=None, extra_headers_dict=None, max_tweets=500, - max_requests=None, atomic=False, output_options="r", **kwargs): + max_requests=None, output_options="r", **kwargs): self.bearer_token = bearer_token self.extra_headers_dict = extra_headers_dict @@ -193,8 +193,6 @@ def __init__(self, endpoint, request_parameters, bearer_token=None, extra_header self.max_requests = (max_requests if max_requests is not None else 10 ** 9) self.endpoint = endpoint - if atomic: - self.output_format = "a" self.output_format = output_options def formatted_output(self): From 36aa3815b3ac09834988eb1a1f278605abe349cf Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Tue, 13 Apr 2021 21:53:32 +0100 Subject: [PATCH 67/83] rename output-options to output-format --- scripts/search_tweets.py | 4 ++-- searchtweets/result_stream.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py index c63dd4b..508ac66 100644 --- a/scripts/search_tweets.py +++ b/scripts/search_tweets.py @@ -120,8 +120,8 @@ def parse_cmd_args(): default=None, help="""A comma-delimited list of Twitter Poll JSON attributes to include in endpoint responses. (API default:"id")""") - argparser.add_argument("--output-options", - dest="output_options", + argparser.add_argument("--output-format", + dest="output_format", default=None, help="""Set output format: 'r' Unmodified API Responses. (default). 
diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py
index 88341b3..6904c4e 100644
--- a/searchtweets/result_stream.py
+++ b/searchtweets/result_stream.py
@@ -170,7 +170,7 @@ class ResultStream:
     session_request_counter = 0

     def __init__(self, endpoint, request_parameters, bearer_token=None, extra_headers_dict=None, max_tweets=500,
-                 max_requests=None, output_options="r", **kwargs):
+                 max_requests=None, output_format="r", **kwargs):

         self.bearer_token = bearer_token
         self.extra_headers_dict = extra_headers_dict
@@ -193,7 +193,7 @@ def __init__(self, endpoint, request_parameters, bearer_token=None, extra_header
         self.max_requests = (max_requests if max_requests is not None
                              else 10 ** 9)
         self.endpoint = endpoint
-        self.output_format = output_options
+        self.output_format = output_format

     def formatted_output(self):

From 9f79bfe582fae3da6412d2ccbe6a38a5a9fc07a6 Mon Sep 17 00:00:00 2001
From: Igor Brigadir
Date: Tue, 13 Apr 2021 21:56:28 +0100
Subject: [PATCH 68/83] add missing output format to poll tweets script

---
 scripts/poll_tweets.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scripts/poll_tweets.py b/scripts/poll_tweets.py
index 6457300..7d0870a 100644
--- a/scripts/poll_tweets.py
+++ b/scripts/poll_tweets.py
@@ -123,6 +123,14 @@ def parse_cmd_args():
                            default=None,
                            help="""A comma-delimited list of Twitter Poll JSON attributes to include in endpoint responses. (API default:"id")""")

+    argparser.add_argument("--output-format",
+                           dest="output_format",
+                           default=None,
+                           help="""Set output format:
+                           'r' Unmodified API Responses. (default).
+                           'a' Atomic Tweets: Tweet objects with expansions inline.
+                           'm' Message Stream: Tweets, Expansions, and Metadata as a stream of messages.""")
+
     #client options.
     argparser.add_argument("--max-tweets", dest="max_tweets",
                            type=int,

From 6ae2d7aa2838a900ff426734f4c46628002837f4 Mon Sep 17 00:00:00 2001
From: Jim Moffitt
Date: Tue, 13 Apr 2021 15:43:18 -0600
Subject: [PATCH 69/83] Adding time.sleep(1) to work with all search rate
 limit of one request per second. Code is too fast ;)

---
 searchtweets/result_stream.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py
index 5072019..596bdc4 100644
--- a/searchtweets/result_stream.py
+++ b/searchtweets/result_stream.py
@@ -343,7 +343,15 @@ def stream(self):
                     {"next_token": self.next_token})
                 logger.info("paging; total requests read so far: {}"
                             .format(self.n_requests))
+
+                #If hitting the "all" search endpoint, wait one second since that endpoint is currently
+                #limited to one request per second.
+                #Revisit and make configurable when the requests-per-second gets revisited.
+                if "tweets/search/all" in self.endpoint:
+                    time.sleep(1)
+
                 self.execute_request()
+
             else:
                 break

From 284ef79d244af7de0f7059a3626e64a61120cc07 Mon Sep 17 00:00:00 2001
From: Igor Brigadir
Date: Tue, 13 Apr 2021 22:45:59 +0100
Subject: [PATCH 70/83] fix command line output-format option

---
 scripts/poll_tweets.py    |  6 +++---
 scripts/search_tweets.py  | 10 ++++------
 searchtweets/api_utils.py |  1 +
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/scripts/poll_tweets.py b/scripts/poll_tweets.py
index 7d0870a..c8ab9e8 100644
--- a/scripts/poll_tweets.py
+++ b/scripts/poll_tweets.py
@@ -39,10 +39,10 @@ def parse_cmd_args():
     argparser.add_argument("--credential-file-key",
                            dest="credential_yaml_key",
-                           default=None,
+                           default="search_tweets_v2",
                            help=("the key in the credential file used "
                                  "for this session's credentials. "
" - "Defaults to search_tweets_api")) + "Defaults to search_tweets_v2")) argparser.add_argument("--env-overwrite", dest="env_overwrite", @@ -125,7 +125,7 @@ def parse_cmd_args(): argparser.add_argument("--output-format", dest="output_format", - default=None, + default="r", help="""Set output format: 'r' Unmodified API Responses. (default). 'a' Atomic Tweets: Tweet objects with expansions inline. diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py index 508ac66..9a808e2 100644 --- a/scripts/search_tweets.py +++ b/scripts/search_tweets.py @@ -91,9 +91,9 @@ def parse_cmd_args(): "'max_results' in the API") argparser.add_argument("--expansions", - dest="expansions", - default=None, - help="""A comma-delimited list of expansions. Specified expansions results in full objects in the 'includes' response object.""") + dest="expansions", + default=None, + help="""A comma-delimited list of expansions. Specified expansions results in full objects in the 'includes' response object.""") argparser.add_argument("--tweet-fields", dest="tweet_fields", @@ -122,7 +122,7 @@ def parse_cmd_args(): argparser.add_argument("--output-format", dest="output_format", - default=None, + default="r", help="""Set output format: 'r' Unmodified API Responses. (default). 'a' Atomic Tweets: Tweet objects with expansions inline. @@ -161,8 +161,6 @@ def parse_cmd_args(): default=True, help="Print tweet stream to stdout") - - argparser.add_argument("--extra-headers", dest="extra_headers", type=str, diff --git a/searchtweets/api_utils.py b/searchtweets/api_utils.py index ca9b8cc..3515a6f 100644 --- a/searchtweets/api_utils.py +++ b/searchtweets/api_utils.py @@ -187,6 +187,7 @@ def intify(arg): "bearer_token": config_dict.get("bearer_token"), "extra_headers_dict": config_dict.get("extra_headers_dict",None), "request_parameters": query, + "output_format": config_dict.get("output_format"), "results_per_file": intify(config_dict.get("results_per_file")), "max_tweets": intify(config_dict.get("max_tweets")), "max_pages": intify(config_dict.get("max_pages", None))} From fa24d862b0948029307f4c0166be43859911ce3f Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Tue, 13 Apr 2021 22:50:35 +0100 Subject: [PATCH 71/83] update docs --- README.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.rst b/README.rst index a69a46f..55b7413 100644 --- a/README.rst +++ b/README.rst @@ -181,13 +181,11 @@ Command-line options stored. --no-print-stream disable print streaming --print-stream Print tweet stream to stdout - --output-options Set output format: + --output-format Set output format: 'r' Unmodified API Responses. (default). 'a' Atomic Tweets: Tweet objects with expansions inline. 'm' Message Stream: Tweets, Expansions, and Metadata as a stream of messages. - --atomic Output "Atomic" Tweet format. - Equivalent to setting --output-format 'a'. --extra-headers EXTRA_HEADERS JSON-formatted str representing a dict of additional HTTP request headers From a599e5469b1275d6e17ff0376fe71fb65066bdf8 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Tue, 13 Apr 2021 16:50:00 -0600 Subject: [PATCH 72/83] JM: passing new '--output-option' setting to dictionary. --- searchtweets/api_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/searchtweets/api_utils.py b/searchtweets/api_utils.py index ca9b8cc..23e93ff 100644 --- a/searchtweets/api_utils.py +++ b/searchtweets/api_utils.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2020 Twitter, Inc. +# Copyright 2021 Twitter, Inc. 
 # Licensed under the MIT License
 # https://opensource.org/licenses/MIT
 """
@@ -189,7 +189,8 @@ def intify(arg):
              "request_parameters": query,
              "results_per_file": intify(config_dict.get("results_per_file")),
              "max_tweets": intify(config_dict.get("max_tweets")),
-             "max_pages": intify(config_dict.get("max_pages", None))}
+             "max_pages": intify(config_dict.get("max_pages", None)),
+             "output_option": config_dict.get("output_option")}

     return _dict

From 9396f05f78cee04aca70c622ad84829a2895ab01 Mon Sep 17 00:00:00 2001
From: Jim Moffitt
Date: Tue, 13 Apr 2021 16:51:50 -0600
Subject: [PATCH 73/83] JM: removing --atomic as standalone option, made '--output-options' singular.

---
 scripts/search_tweets.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py
index 8ff9031..6f72d2e 100644
--- a/scripts/search_tweets.py
+++ b/scripts/search_tweets.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2020 Twitter, Inc.
+# Copyright 2021 Twitter, Inc.
 # Licensed under the Apache License, Version 2.0
 # http://www.apache.org/licenses/LICENSE-2.0
 import os
@@ -120,19 +120,13 @@ def parse_cmd_args():
                            default=None,
                            help="""A comma-delimited list of Twitter Poll JSON attributes to include in endpoint responses. (API default:"id")""")

-    argparser.add_argument("--atomic",
-                           dest="atomic",
-                           action="store_true",
-                           default=False,
-                           help="Inject 'includes' objects into Tweet objects.")
-
-    argparser.add_argument("--output-options",
-                           dest="output_options",
+    argparser.add_argument("--output-option",
+                           dest="output_option",
                            default=None,
                            help="""Set output format:
-                           'r' Unmodified API Responses. (default).
-                           'a' Atomic Tweets: Tweet objects with expansions inline.
-                           'm' Message Stream: Tweets, Expansions, and Metadata as a stream of messages.""")
+                           'r' Unmodified API [R]esponses. (default).
+                           'a' [A]tomic Tweets: Tweet objects with expansions inline.
+                           'm' [M]essage stream: Tweets, expansions, and pagination metadata as a stream of messages.""")

     argparser.add_argument("--max-tweets", dest="max_tweets",
                            type=int,

From 1cb4fbe5f6a367cd6507df6e7fbbc5afced8260b Mon Sep 17 00:00:00 2001
From: Igor Brigadir
Date: Wed, 14 Apr 2021 12:16:59 +0100
Subject: [PATCH 74/83] docstring consistency

---
 scripts/poll_tweets.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/poll_tweets.py b/scripts/poll_tweets.py
index c8ab9e8..6652673 100644
--- a/scripts/poll_tweets.py
+++ b/scripts/poll_tweets.py
@@ -127,9 +127,9 @@ def parse_cmd_args():
                            dest="output_format",
                            default="r",
                            help="""Set output format:
-                           'r' Unmodified API Responses. (default).
-                           'a' Atomic Tweets: Tweet objects with expansions inline.
-                           'm' Message Stream: Tweets, Expansions, and Metadata as a stream of messages.""")
+                           'r' Unmodified API [R]esponses. (default).
+                           'a' [A]tomic Tweets: Tweet objects with expansions inline.
+                           'm' [M]essage stream: Tweets, expansions, and pagination metadata as a stream of messages.""")

     #client options.
argparser.add_argument("--max-tweets", dest="max_tweets", From 4cbcb5c650d67d9b46e11409e5607d624149032f Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Wed, 14 Apr 2021 12:17:51 +0100 Subject: [PATCH 75/83] rename output-option to output-format --- searchtweets/api_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/searchtweets/api_utils.py b/searchtweets/api_utils.py index 099d5be..d9c3f1a 100644 --- a/searchtweets/api_utils.py +++ b/searchtweets/api_utils.py @@ -187,11 +187,10 @@ def intify(arg): "bearer_token": config_dict.get("bearer_token"), "extra_headers_dict": config_dict.get("extra_headers_dict",None), "request_parameters": query, - "output_format": config_dict.get("output_format"), "results_per_file": intify(config_dict.get("results_per_file")), "max_tweets": intify(config_dict.get("max_tweets")), "max_pages": intify(config_dict.get("max_pages", None)), - "output_option": config_dict.get("output_option")} + "output_format": config_dict.get("output_format")} return _dict From f667111ba6e0647077ee6401a5e5e3f113c50042 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Wed, 14 Apr 2021 13:13:40 -0600 Subject: [PATCH 76/83] Update README.rst --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 59516f4..4e78a21 100644 --- a/README.rst +++ b/README.rst @@ -37,9 +37,9 @@ Features - Supports a new "polling" mode using the ``since-id`` search request parameter. The ``since-id``, along with the new ``until-id`` provide a way to navigate the public Tweet archive by Tweet ID. - Supports additional ways to specify ``start-time`` and ``end-time`` request parameters: - - d# - For example, '2d' sets ``start-time`` to (exactly) two days ago. - - h# - For example, '12h' sets ``start-time`` to (exactly) twelve hours ago. - - m# - For example, '15m' sets ``start-time`` to (exactly) fifteen minutes ago. + - #d - For example, '2d' sets ``start-time`` to (exactly) two days ago. + - #h - For example, '12h' sets ``start-time`` to (exactly) twelve hours ago. + - #m - For example, '15m' sets ``start-time`` to (exactly) fifteen minutes ago. These are handy for kicking off searches with a backfill period, and also work with the ``end-time`` request parameter. From b96059a65fd61172b28c585a4ba0a6fd2b643fc5 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Wed, 14 Apr 2021 13:59:00 -0600 Subject: [PATCH 77/83] Updating --help output --- README.rst | 158 +++++++++++++++++++---------------------------------- 1 file changed, 56 insertions(+), 102 deletions(-) diff --git a/README.rst b/README.rst index 4e78a21..77a25e9 100644 --- a/README.rst +++ b/README.rst @@ -87,109 +87,63 @@ In the original code, this Tweet parser was envoked with a `tweetify=True direct Command-line options ==================== -.. 
code:: bash - - usage: search_tweets.py [-h] [--credential-file CREDENTIAL_FILE] - [--credential-file-key CREDENTIAL_YAML_KEY] - [--env-overwrite ENV_OVERWRITE] - [--config-file CONFIG_FILENAME] - [--query QUERY] - [--start-time START_TIME] - [--end-time END_TIME] - [--since-id SINCE_ID] - [--until-id UNTIL_ID] - [--results-per-call RESULTS_PER_CALL] - [--expansions EXPANSIONS] - [--tweet-fields TWEET_FIELDS] - [--user-fields USER_FIELDS] - [--media-fields MEDIA_FIELDS] - [--place-fields PLACE_FIELDS] - [--poll-fields POLL_FIELDS] - [--max-tweets MAX_TWEETS] - [--max-pages MAX_PAGES] - [--results-per-file RESULTS_PER_FILE] - [--filename-prefix FILENAME_PREFIX] - [--no-print-stream] - [--print-stream] - [--output-options] - [--atomic] - [--extra-headers EXTRA_HEADERS] - [--debug] - - -h, --help show this help message and exit - --credential-file CREDENTIAL_FILE - Location of the yaml file used to hold your - credentials. - --credential-file-key CREDENTIAL_YAML_KEY - the key in the credential file used for this session's - credentials. Defaults to search_tweets_v2 - --env-overwrite ENV_OVERWRITE - Overwrite YAML-parsed credentials with any set - environment variables. See API docs or readme for - details. - --config-file CONFIG_FILENAME - configuration file with all parameters. Far, easier to - use than the command-line args version., If a valid - file is found, all args will be populated, from there. - Remaining command-line args, will overrule args found - in the config, file. - --query QUERY Search query. (See: - https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-rule) - --start-time START_TIME - Start of datetime window, format 'YYYY-mm-DDTHH:MM' - (default: -7 days for /recent, -30 days for /all) - --end-time END_TIME End of datetime window, format 'YYYY-mm-DDTHH:MM' - (default: to 30 seconds before request time) - --since-id SINCE_ID Tweet ID, will start search from Tweets after this - one. (See: - https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/paginate) - --until-id UNTIL_ID Tweet ID, will end search from Tweets before this one. - (See: - https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/paginate) - --results-per-call RESULTS_PER_CALL - Number of results to return per call (default 10; max - 100) - corresponds to 'max_results' in the API - --expansions EXPANSIONS - A comma-delimited list of object expansions to include - in endpoint responses. (API default: "") - --tweet-fields TWEET_FIELDS - A comma-delimited list of Tweet JSON attributions to - include in endpoint responses. (API default: "id, text") - --user-fields USER_FIELDS - A comma-delimited list of user JSON attributions to - include in endpoint responses. (API default: "id") - --media-fields MEDIA_FIELDS - A comma-delimited list of media JSON attributions to - include in endpoint responses. (API default: "id") - --place-fields PLACE_FIELDS - A comma-delimited list of Twitter Place JSON - attributions to include in endpoint responses. (API - default: "id") - --poll-fields POLL_FIELDS - A comma-delimited list of Tweet Poll JSON attributions - to include in endpoint responses. (API default: "id") - --max-tweets MAX_TWEETS - Maximum number of Tweets to return for this session of - requests. - --max-pages MAX_PAGES - Maximum number of pages/API calls to use for this - session. - --results-per-file RESULTS_PER_FILE +.. 
code::
+usage: search_tweets.py
+    [-h] [--credential-file CREDENTIAL_FILE] [--credential-file-key CREDENTIAL_YAML_KEY] [--env-overwrite ENV_OVERWRITE] [--config-file CONFIG_FILENAME] [--query QUERY]
+    [--start-time START_TIME] [--end-time END_TIME] [--since-id SINCE_ID] [--until-id UNTIL_ID] [--results-per-call RESULTS_PER_CALL] [--expansions EXPANSIONS]
+    [--tweet-fields TWEET_FIELDS] [--user-fields USER_FIELDS] [--media-fields MEDIA_FIELDS] [--place-fields PLACE_FIELDS] [--poll-fields POLL_FIELDS]
+    [--output-format OUTPUT_FORMAT] [--max-tweets MAX_TWEETS] [--max-pages MAX_PAGES] [--results-per-file RESULTS_PER_FILE] [--filename-prefix FILENAME_PREFIX]
+    [--no-print-stream] [--print-stream] [--extra-headers EXTRA_HEADERS] [--debug]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --credential-file CREDENTIAL_FILE
+                        Location of the yaml file used to hold your credentials.
+  --credential-file-key CREDENTIAL_YAML_KEY
+                        the key in the credential file used for this session's credentials. Defaults to search_tweets_v2
+  --env-overwrite ENV_OVERWRITE
+                        Overwrite YAML-parsed credentials with any set environment variables. See API docs or readme for details.
+  --config-file CONFIG_FILENAME
+                        configuration file with all parameters. Far easier to use than the command-line args version. If a valid file is found, all args will be populated
+                        from there. Remaining command-line args will overrule args found in the config file.
+  --query QUERY         Search query. (See: https://developer.twitter.com/en/docs/labs/recent-search/guides/search-queries)
+  --start-time START_TIME
+                        Start of datetime window, format 'YYYY-mm-DDTHH:MM' (default: -7 days for /recent, -30 days for /all)
+  --end-time END_TIME   End of datetime window, format 'YYYY-mm-DDTHH:MM' (default: to 30 seconds before request time)
+  --since-id SINCE_ID   Tweet ID, will start search from Tweets after this one. (See: https://developer.twitter.com/en/docs/labs/recent-search/guides/pagination)
+  --until-id UNTIL_ID   Tweet ID, will end search from Tweets before this one. (See: https://developer.twitter.com/en/docs/labs/recent-search/guides/pagination)
+  --results-per-call RESULTS_PER_CALL
+                        Number of results to return per call (default 10; max 100) - corresponds to 'max_results' in the API
+  --expansions EXPANSIONS
+                        A comma-delimited list of expansions. Specified expansions result in full objects in the 'includes' response object.
+  --tweet-fields TWEET_FIELDS
+                        A comma-delimited list of Tweet JSON attributes to include in endpoint responses. (API default:"id,text")
+  --user-fields USER_FIELDS
+                        A comma-delimited list of User JSON attributes to include in endpoint responses. (API default:"id")
+  --media-fields MEDIA_FIELDS
+                        A comma-delimited list of media JSON attributes to include in endpoint responses. (API default:"id")
+  --place-fields PLACE_FIELDS
+                        A comma-delimited list of Twitter Place JSON attributes to include in endpoint responses. (API default:"id")
+  --poll-fields POLL_FIELDS
+                        A comma-delimited list of Twitter Poll JSON attributes to include in endpoint responses. (API default:"id")
+  --output-format OUTPUT_FORMAT
+                        Set output format: 'r' Unmodified API [R]esponses. (default). 'a' [A]tomic Tweets: Tweet objects with expansions inline. 'm' [M]essage stream: Tweets,
+                        expansions, and pagination metadata as a stream of messages.
+  --max-tweets MAX_TWEETS
+                        Maximum number of Tweets to return for this session of requests. 
+ --max-pages MAX_PAGES + Maximum number of pages/API calls to use for this session. + --results-per-file RESULTS_PER_FILE Maximum tweets to save per file. - --filename-prefix FILENAME_PREFIX - prefix for the filename where tweet json data will be - stored. - --no-print-stream disable print streaming - --print-stream Print tweet stream to stdout - --output-format Set output format: - 'r' Unmodified API Responses. (default). - 'a' Atomic Tweets: Tweet objects with expansions inline. - 'm' Message Stream: Tweets, Expansions, and Metadata - as a stream of messages. - --extra-headers EXTRA_HEADERS - JSON-formatted str representing a dict of additional - HTTP request headers - --debug print all info and warning messages + --filename-prefix FILENAME_PREFIX + prefix for the filename where tweet json data will be stored. + --no-print-stream disable print streaming + --print-stream Print tweet stream to stdout + --extra-headers EXTRA_HEADERS + JSON-formatted str representing a dict of additional HTTP request headers + --debug print all info and warning messages + Installation From 23e4400687bc967dbff313a7d66e57095c0939a5 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Thu, 15 Apr 2021 16:23:25 -0600 Subject: [PATCH 78/83] Update README.rst --- README.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 77a25e9..64dd40d 100644 --- a/README.rst +++ b/README.rst @@ -33,7 +33,10 @@ Features - Supports Twitter API v2 'recent' and 'all' search. - Supports the configuration of v2 `expansions `_ and `fields `_. -- Supports multiple output formats: Original API responses (new default), as a stream of messages (previous default in versions <1.0.7), and new 'atomic' format with expansions included in tweets. +- Supports multiple output formats: + * Original API responses (new default) + * Stream of messages (previous default in versions <1.0.7) + * New 'atomic' format with expansions included in tweets. - Supports a new "polling" mode using the ``since-id`` search request parameter. The ``since-id``, along with the new ``until-id`` provide a way to navigate the public Tweet archive by Tweet ID. - Supports additional ways to specify ``start-time`` and ``end-time`` request parameters: From 0888d2e53f17e732833977572205fadcccce0ca6 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 20 Apr 2021 14:59:33 -0600 Subject: [PATCH 79/83] Cleaning out some legacy non-v2 details --- examples/api_example.ipynb | 182 +++---------------------------------- 1 file changed, 14 insertions(+), 168 deletions(-) diff --git a/examples/api_example.ipynb b/examples/api_example.ipynb index a75aa62..ed9a65a 100644 --- a/examples/api_example.ipynb +++ b/examples/api_example.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Working with the API within a Python program is straightforward both for Premium and Enterprise clients.\n", + "Working with the API within a Python program is straightforward for the v2 client.\n", "\n", "We'll assume that credentials are in the default location, `~/.twitter_keys.yaml`." 
] @@ -17,7 +17,7 @@ }, "outputs": [], "source": [ - "from searchtweets import ResultStream, gen_rule_payload, load_credentials" + "from searchtweets import ResultStream, gen_request_parameters, load_credentials" ] }, { @@ -36,7 +36,7 @@ "outputs": [], "source": [ "enterprise_search_args = load_credentials(\"~/.twitter_keys.yaml\",\n", - " yaml_key=\"search_tweets_enterprise\",\n", + " yaml_key=\"search_tweets_v2\",\n", " env_overwrite=False)" ] }, @@ -64,7 +64,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There is a function that formats search API rules into valid json queries called `gen_rule_payload`. It has sensible defaults, such as pulling more Tweets per call than the default 100 (but note that a sandbox environment can only have a max of 100 here, so if you get errors, please check this) not including dates, and defaulting to hourly counts when using the counts api. Discussing the finer points of generating search rules is out of scope for these examples; I encourage you to see the docs to learn the nuances within, but for now let's see what a rule looks like." + "There is a function that formats search API rules into valid json queries called `gen_request_parameters`. It has sensible defaults, such as pulling more Tweets per call than the default 10 and not including dates. Discussing the finer points of generating search rules is out of scope for these examples; I encourage you to see the docs to learn the nuances within, but for now let's see what a rule looks like." ] }, { @@ -81,7 +81,7 @@ } ], "source": [ - "rule = gen_rule_payload(\"beyonce\", results_per_call=100) # testing with a sandbox account\n", + "query = gen_request_parameters(\"beyonce\", results_per_call=100) # testing with a sandbox account\n", "print(rule)" ] }, @@ -89,7 +89,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This rule will match tweets that have the text `beyonce` in them." + "This query will match tweets that have the text `beyonce` in them." ] }, { @@ -101,11 +101,11 @@ "\n", "## Fast Way\n", "\n", - "We'll use the `search_args` variable to power the configuration point for the API. The object also takes a valid PowerTrack rule and has options to cutoff search when hitting limits on both number of Tweets and API calls.\n", + "We'll use the `search_args` variable to power the configuration point for the API. 
The object also takes a valid query and has options to cutoff search when hitting limits on both number of Tweets and API calls.\n", "\n", "We'll be using the `collect_results` function, which has three parameters.\n", "\n", - "- rule: a valid PowerTrack rule, referenced earlier\n", + "- query: a valid search query, referenced earlier\n", "- max_results: as the API handles pagination, it will stop collecting when we get to this number\n", "- result_stream_args: configuration args that we've already specified.\n", "\n", @@ -135,7 +135,7 @@ }, "outputs": [], "source": [ - "tweets = collect_results(rule,\n", + "tweets = collect_results(query,\n", " max_results=100,\n", " result_stream_args=enterprise_search_args) # change this if you need to" ] @@ -261,13 +261,13 @@ "ResultStream: \n", "\t{\n", " \"username\":null,\n", - " \"endpoint\":\"https:\\/\\/api.twitter.com\\/1.1\\/tweets\\/search\\/30day\\/dev.json\",\n", + " \"endpoint\":\"https:\\/\\/api.twitter.com\\/2\\/tweets\\/search\\/recent\",\n", " \"rule_payload\":{\n", " \"query\":\"beyonce\",\n", " \"maxResults\":100\n", " },\n", - " \"tweetify\":true,\n", - " \"max_results\":500\n", + " \"tweetify\":false,\n", + " \"max_results\":100\n", "}\n" ] } @@ -335,90 +335,6 @@ "[print(tweet.all_text) for tweet in tweets[0:10]];" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Counts Endpoint\n", - "\n", - "We can also use the Search API Counts endpoint to get counts of Tweets that match our rule. Each request will return up to *30* results, and each count request can be done on a minutely, hourly, or daily basis. The underlying `ResultStream` object will handle converting your endpoint to the count endpoint, and you have to specify the `count_bucket` argument when making a rule to use it.\n", - "\n", - "The process is very similar to grabbing Tweets, but has some minor differences.\n", - "\n", - "\n", - "_Caveat - premium sandbox environments do NOT have access to the Search API counts endpoint._" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "count_rule = gen_rule_payload(\"beyonce\", count_bucket=\"day\")\n", - "\n", - "counts = collect_results(count_rule, result_stream_args=enterprise_search_args)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our results are pretty straightforward and can be rapidly used." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'count': 366, 'timePeriod': '201801170000'},\n", - " {'count': 44580, 'timePeriod': '201801160000'},\n", - " {'count': 61932, 'timePeriod': '201801150000'},\n", - " {'count': 59678, 'timePeriod': '201801140000'},\n", - " {'count': 44014, 'timePeriod': '201801130000'},\n", - " {'count': 46607, 'timePeriod': '201801120000'},\n", - " {'count': 41523, 'timePeriod': '201801110000'},\n", - " {'count': 47056, 'timePeriod': '201801100000'},\n", - " {'count': 65506, 'timePeriod': '201801090000'},\n", - " {'count': 95251, 'timePeriod': '201801080000'},\n", - " {'count': 162883, 'timePeriod': '201801070000'},\n", - " {'count': 106344, 'timePeriod': '201801060000'},\n", - " {'count': 93542, 'timePeriod': '201801050000'},\n", - " {'count': 110415, 'timePeriod': '201801040000'},\n", - " {'count': 127523, 'timePeriod': '201801030000'},\n", - " {'count': 131952, 'timePeriod': '201801020000'},\n", - " {'count': 176157, 'timePeriod': '201801010000'},\n", - " {'count': 57229, 'timePeriod': '201712310000'},\n", - " {'count': 72277, 'timePeriod': '201712300000'},\n", - " {'count': 72051, 'timePeriod': '201712290000'},\n", - " {'count': 76371, 'timePeriod': '201712280000'},\n", - " {'count': 61578, 'timePeriod': '201712270000'},\n", - " {'count': 55118, 'timePeriod': '201712260000'},\n", - " {'count': 59115, 'timePeriod': '201712250000'},\n", - " {'count': 106219, 'timePeriod': '201712240000'},\n", - " {'count': 114732, 'timePeriod': '201712230000'},\n", - " {'count': 73327, 'timePeriod': '201712220000'},\n", - " {'count': 89171, 'timePeriod': '201712210000'},\n", - " {'count': 192381, 'timePeriod': '201712200000'},\n", - " {'count': 85554, 'timePeriod': '201712190000'},\n", - " {'count': 57829, 'timePeriod': '201712180000'}]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "counts" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -516,84 +432,14 @@ } ], "source": [ - "rule = gen_rule_payload(\"from:jack\",\n", + "query = gen_request_parameters(\"from:jack\",\n", " from_date=\"2017-09-20\",\n", " to_date=\"2017-10-30\",\n", " count_bucket=\"day\",\n", " results_per_call=500)\n", - "print(rule)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "counts = collect_results(rule, max_results=500, result_stream_args=enterprise_search_args)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'timePeriod': '201710290000', 'count': 0}\n", - "{'timePeriod': '201710280000', 'count': 0}\n", - "{'timePeriod': '201710270000', 'count': 3}\n", - "{'timePeriod': '201710260000', 'count': 6}\n", - "{'timePeriod': '201710250000', 'count': 4}\n", - "{'timePeriod': '201710240000', 'count': 4}\n", - "{'timePeriod': '201710230000', 'count': 0}\n", - "{'timePeriod': '201710220000', 'count': 0}\n", - "{'timePeriod': '201710210000', 'count': 3}\n", - "{'timePeriod': '201710200000', 'count': 2}\n", - "{'timePeriod': '201710190000', 'count': 1}\n", - "{'timePeriod': '201710180000', 'count': 6}\n", - "{'timePeriod': '201710170000', 'count': 2}\n", - "{'timePeriod': '201710160000', 'count': 2}\n", - "{'timePeriod': '201710150000', 'count': 1}\n", - "{'timePeriod': '201710140000', 'count': 64}\n", - "{'timePeriod': '201710130000', 
'count': 3}\n", - "{'timePeriod': '201710120000', 'count': 4}\n", - "{'timePeriod': '201710110000', 'count': 8}\n", - "{'timePeriod': '201710100000', 'count': 4}\n", - "{'timePeriod': '201710090000', 'count': 1}\n", - "{'timePeriod': '201710080000', 'count': 0}\n", - "{'timePeriod': '201710070000', 'count': 0}\n", - "{'timePeriod': '201710060000', 'count': 1}\n", - "{'timePeriod': '201710050000', 'count': 3}\n", - "{'timePeriod': '201710040000', 'count': 5}\n", - "{'timePeriod': '201710030000', 'count': 8}\n", - "{'timePeriod': '201710020000', 'count': 5}\n", - "{'timePeriod': '201710010000', 'count': 0}\n", - "{'timePeriod': '201709300000', 'count': 0}\n", - "{'timePeriod': '201709290000', 'count': 0}\n", - "{'timePeriod': '201709280000', 'count': 9}\n", - "{'timePeriod': '201709270000', 'count': 41}\n", - "{'timePeriod': '201709260000', 'count': 13}\n", - "{'timePeriod': '201709250000', 'count': 6}\n", - "{'timePeriod': '201709240000', 'count': 7}\n", - "{'timePeriod': '201709230000', 'count': 3}\n", - "{'timePeriod': '201709220000', 'count': 0}\n", - "{'timePeriod': '201709210000', 'count': 1}\n", - "{'timePeriod': '201709200000', 'count': 7}\n" - ] - } - ], - "source": [ - "[print(c) for c in counts];" + "print(query)" ] } - ], "metadata": { "kernelspec": { "display_name": "Python 3", From 4158f9e73765f68b8e710d26bb97229c804d8469 Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 20 Apr 2021 15:02:08 -0600 Subject: [PATCH 80/83] Update api_example.ipynb --- examples/api_example.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/api_example.ipynb b/examples/api_example.ipynb index ed9a65a..cc25c75 100644 --- a/examples/api_example.ipynb +++ b/examples/api_example.ipynb @@ -439,7 +439,7 @@ " results_per_call=500)\n", "print(query)" ] - } + }, "metadata": { "kernelspec": { "display_name": "Python 3", From f3d1deb34ba9fd14d5714e201555397e988c2acb Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 20 Apr 2021 15:03:02 -0600 Subject: [PATCH 81/83] Update api_example.ipynb --- examples/api_example.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/api_example.ipynb b/examples/api_example.ipynb index cc25c75..59fdee6 100644 --- a/examples/api_example.ipynb +++ b/examples/api_example.ipynb @@ -439,7 +439,7 @@ " results_per_call=500)\n", "print(query)" ] - }, + }], "metadata": { "kernelspec": { "display_name": "Python 3", From f5005adde89e725a0126e2ab3fe47c5a03e3ee2f Mon Sep 17 00:00:00 2001 From: "@snowman" Date: Tue, 20 Apr 2021 15:08:47 -0600 Subject: [PATCH 82/83] more clean-up --- examples/api_example.ipynb | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/examples/api_example.ipynb b/examples/api_example.ipynb index 59fdee6..cd5b31c 100644 --- a/examples/api_example.ipynb +++ b/examples/api_example.ipynb @@ -24,7 +24,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Enterprise setup" + "## v2 setup" ] }, { @@ -35,31 +35,11 @@ }, "outputs": [], "source": [ - "enterprise_search_args = load_credentials(\"~/.twitter_keys.yaml\",\n", + "v2_search_args = load_credentials(\"~/.twitter_keys.yaml\",\n", " yaml_key=\"search_tweets_v2\",\n", " env_overwrite=False)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Premium Setup\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "premium_search_args = load_credentials(\"~/.twitter_keys.yaml\",\n", - " 
yaml_key=\"search_tweets_premium\",\n", - " env_overwrite=False)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -137,7 +117,7 @@ "source": [ "tweets = collect_results(query,\n", " max_results=100,\n", - " result_stream_args=enterprise_search_args) # change this if you need to" + " result_stream_args=v2_search_args) # change this if you need to" ] }, { From a7d624723052c9783d17f3ecb5491631654b5e60 Mon Sep 17 00:00:00 2001 From: Jim Moffitt Date: Thu, 24 Jun 2021 12:42:31 -0600 Subject: [PATCH 83/83] JM: updates to support 'counts' requests. --- scripts/search_tweets.py | 7 ++++ searchtweets/api_utils.py | 61 ++++++++++++++++++++++++++++++++--- searchtweets/result_stream.py | 51 ++++++++++++++++++++++++----- 3 files changed, 107 insertions(+), 12 deletions(-) diff --git a/scripts/search_tweets.py b/scripts/search_tweets.py index e103429..4cba8eb 100644 --- a/scripts/search_tweets.py +++ b/scripts/search_tweets.py @@ -62,6 +62,13 @@ def parse_cmd_args(): default=None, help="Search query. (See: https://developer.twitter.com/en/docs/labs/recent-search/guides/search-queries)") + #Use of this command triggers a search count request. + argparser.add_argument("--granularity", + dest="granularity", + default=None, + help=("""Set this to make a 'counts' request. 'Bucket' size for the search counts API. Options: + day, hour, minute. Aligned to midnight UTC.""")) + argparser.add_argument("--start-time", dest="start_time", default=None, diff --git a/searchtweets/api_utils.py b/searchtweets/api_utils.py index d9c3f1a..5e0ba3a 100644 --- a/searchtweets/api_utils.py +++ b/searchtweets/api_utils.py @@ -18,6 +18,9 @@ __all__ = ["gen_request_parameters", "gen_params_from_config", + "infer_endpoint", + "change_to_count_endpoint", + "validate_count_api", "convert_utc_time"] logger = logging.getLogger(__name__) @@ -81,7 +84,7 @@ def convert_utc_time(datetime_str): return _date.strftime("%Y-%m-%dT%H:%M:%SZ") -def gen_request_parameters(query, results_per_call=None, +def gen_request_parameters(query, granularity, results_per_call=None, start_time=None, end_time=None, since_id=None, until_id=None, tweet_fields=None, user_fields=None, media_fields=None, place_fields=None, poll_fields=None, @@ -142,9 +145,45 @@ def gen_request_parameters(query, results_per_call=None, payload["poll.fields"] = poll_fields if expansions: payload["expansions"] = expansions + if granularity: + payload["granularity"] = granularity return json.dumps(payload) if stringify else payload +def infer_endpoint(request_parameters): + """ + Infer which endpoint should be used for a given rule payload. + """ + if 'granularity' in request_parameters.keys(): + return 'counts' + else: + return 'search' #TODO: else "Tweets" makes more sense? + +def change_to_count_endpoint(endpoint): + """Utility function to change a normal 'get Tweets' endpoint to a ``count`` api + endpoint. Returns the same endpoint if it's already a valid count endpoint. + Args: + endpoint (str): your api endpoint + Returns: + str: the modified endpoint for a count endpoint. + + Recent search Tweet endpoint: https://api.twitter.com/2/tweets/search/recent + Recent search Counts endpoint: https://api.twitter.com/2/tweets/counts/recent + + FAS Tweet endpoint: https://api.twitter.com/2/tweets/search/all + FAS Counts endpoint: https://api.twitter.com/2/tweets/counts/all + + """ + if 'counts' in endpoint: + return endpoint + else: #Add in counts to endpoint URL. 
+def change_to_count_endpoint(endpoint):
+    """Utility function to change a normal 'get Tweets' endpoint to a ``counts``
+    endpoint. Returns the same endpoint if it's already a valid counts endpoint.
+    Args:
+        endpoint (str): your API endpoint
+    Returns:
+        str: the modified endpoint for a counts request.
+
+    Recent search Tweet endpoint: https://api.twitter.com/2/tweets/search/recent
+    Recent search Counts endpoint: https://api.twitter.com/2/tweets/counts/recent
+
+    FAS Tweet endpoint: https://api.twitter.com/2/tweets/search/all
+    FAS Counts endpoint: https://api.twitter.com/2/tweets/counts/all
+
+    """
+    if 'counts' in endpoint:
+        return endpoint
+    else:
+        #Add 'counts' to the endpoint URL.
+        #TODO: update to *build* the URL by injecting 'counts', to handle FAS.
+        #Insert the 'counts' token as the second-to-last path token.
+        tokens = endpoint.split('/')
+        search_type = tokens[-1]
+        base = endpoint.split('tweets')
+        endpoint = base[0] + 'tweets/counts/' + search_type
+        return endpoint
 
 def gen_params_from_config(config_dict):
     """
@@ -170,6 +209,7 @@ def intify(arg):
     results_per_call = intify(config_dict.get("results_per_call", None))
 
     query = gen_request_parameters(query=config_dict["query"],
+                            granularity=config_dict.get("granularity", None),
                             start_time=config_dict.get("start_time", None),
                             end_time=config_dict.get("end_time", None),
                             since_id=config_dict.get("since_id", None),
                             until_id=config_dict.get("until_id", None),
                             tweet_fields=config_dict.get("tweet_fields", None),
                             user_fields=config_dict.get("user_fields", None),
                             media_fields=config_dict.get("media_fields", None),
                             place_fields=config_dict.get("place_fields", None),
                             poll_fields=config_dict.get("poll_fields", None),
                             expansions=config_dict.get("expansions", None),
                             results_per_call=results_per_call)
-                            #count_bucket=config_dict.get("count_bucket", None))
 
     _dict = {"endpoint": endpoint,
              "bearer_token": config_dict.get("bearer_token"),
@@ -194,7 +233,21 @@
 
     return _dict
 
-
-
+#TODO: Check whether this is still needed, now that the code dynamically checks/updates the endpoint based on the use of 'granularity.'
+def validate_count_api(request_parameters, endpoint):
+    """
+    Ensures that the counts API is set correctly for a payload.
+    """
+    rule = (request_parameters if isinstance(request_parameters, dict)
+            else json.loads(request_parameters))
+    granularity = rule.get('granularity')
+    if 'counts' not in endpoint:
+        if granularity is not None:
+            msg = ("""There is a 'granularity' present in your request,
+                   but you are not using the counts API.
+                   Please check your endpoint and try again""")
+            logger.error(msg)
+            raise ValueError(msg)
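As a sanity check, the endpoint rewrite above should behave like this sketch (the URLs mirror the docstring examples; the last case exercises the early return for an endpoint that is already a counts endpoint):

    >>> from searchtweets.api_utils import change_to_count_endpoint
    >>> change_to_count_endpoint("https://api.twitter.com/2/tweets/search/recent")
    'https://api.twitter.com/2/tweets/counts/recent'
    >>> change_to_count_endpoint("https://api.twitter.com/2/tweets/search/all")
    'https://api.twitter.com/2/tweets/counts/all'
    >>> change_to_count_endpoint("https://api.twitter.com/2/tweets/counts/recent")
    'https://api.twitter.com/2/tweets/counts/recent'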
diff --git a/searchtweets/result_stream.py b/searchtweets/result_stream.py
index e44a333..dee943d 100644
--- a/searchtweets/result_stream.py
+++ b/searchtweets/result_stream.py
@@ -18,6 +18,7 @@
 import json
 
 from .utils import merge_dicts
+from .api_utils import infer_endpoint, change_to_count_endpoint
 from collections import defaultdict
 
 from ._version import VERSION
@@ -172,7 +173,7 @@ class ResultStream:
     def __init__(self, endpoint, request_parameters, bearer_token=None, extra_headers_dict=None,
                  max_tweets=500, max_requests=None, output_format="r", **kwargs):
-        self.bearer_token = bearer_token
+        self.bearer_token = bearer_token  #TODO: Add support for user tokens.
         self.extra_headers_dict = extra_headers_dict
         if isinstance(request_parameters, str):
             request_parameters = json.loads(request_parameters)
@@ -192,7 +193,21 @@
         # magic number of requests!
         self.max_requests = (max_requests if max_requests is not None
                              else 10 ** 9)
-        self.endpoint = endpoint
+
+        #Branch to the counts or Tweets endpoint.
+        #TODO: unit testing
+        self.search_type = 'tweets'
+        self.endpoint = (change_to_count_endpoint(endpoint)
+                         if infer_endpoint(request_parameters) == "counts"
+                         else endpoint)
+
+        if 'counts' in self.endpoint:
+            self.search_type = 'counts'
+
         self.output_format = output_format
 
     def formatted_output(self):
@@ -209,6 +224,7 @@ def extract_includes(expansion, _id="id"):
         else:
             return defaultdict(lambda: {})
 
+        #TODO: counts responses do not have extractions... so skip this for counts requests.
         # Users extracted both by id and by username for expanding mentions
         includes_users = merge_dicts(extract_includes("users"),
                                      extract_includes("users", "username"))
         # Tweets in includes will themselves be expanded
@@ -265,18 +281,25 @@ def expand_payload(payload):
 
         return payload
 
+        #TODO: Tweets or Counts?
         # First, expand the included tweets, before processing actual result tweets:
-        for included_id, included_tweet in extract_includes("tweets").items():
-            includes_tweets[included_id] = expand_payload(included_tweet)
+        if self.search_type == 'tweets':
+            for included_id, included_tweet in extract_includes("tweets").items():
+                includes_tweets[included_id] = expand_payload(included_tweet)
 
         def output_response_format():
             """
             output the response as 1 "page" per line
             """
-            if self.total_results >= self.max_tweets:
-                return
+            #TODO: counts details
+            if self.search_type == 'tweets':
+                if self.total_results >= self.max_tweets:
+                    return
             yield self.current_response
-            self.total_results += self.meta['result_count']
+
+            #With counts requests there are no Tweets to tally here (although requests themselves could be counted).
+            if self.search_type == 'tweets':
+                self.total_results += self.meta['result_count']
 
         def output_atomic_format():
             """
@@ -327,6 +350,7 @@ def stream(self):
         >>> results = list(ResultStream(**kwargs).stream())
         """
         self.init_session()
+        #self.check_counts() #TODO: not needed if no Tweet parser is being used.
         self.execute_request()
         self.stream_started = True
 
@@ -346,7 +370,7 @@
             #limited to one request per sleep.
             #Revisit and make configurable when the requests-per-second gets revisited.
             if "tweets/search/all" in self.endpoint:
-                time.sleep(1)
+                time.sleep(2)
 
             self.execute_request()
 
@@ -367,6 +391,17 @@ def init_session(self):
         self.session = make_session(self.bearer_token,
                                     self.extra_headers_dict)
+
+    #TODO: not needed if no Tweet parser is being used.
+    def check_counts(self):
+        """
+        Disables Tweet parsing if the counts API is used.
+        """
+        if "counts" in re.split("[/.]", self.endpoint):
+            logger.info("disabling tweet parsing due to counts API usage")
+            self._tweet_func = lambda x: x
+
+
     def execute_request(self):
         """
         Sends the request to the API and parses the JSON response.
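 
         Example — a sketch; assumes ``rs`` is a ResultStream whose session
         has already been initialized via ``init_session()``:
 
         >>> rs.execute_request()
         >>> rs.current_response is not None
         True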