@willkg
Created March 17, 2017 15:14
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""To run:

1. ``mkvirtualenv --python=/usr/bin/python3 crashids``
2. ``pip install -r requirements.txt``
3. ``python get_ids_1337688.py <ACCESS_KEY> <SECRET_ACCESS_KEY>``

This gets crash ids before and after a certain build, pulls down their
``upload_file_minidump`` files from s3, then compares all the files and prints
out a rough analysis.
"""
import logging
import os
import pathlib
import sys

import boto3
from botocore.client import Config
import requests

logging.basicConfig(level=logging.DEBUG)

BUCKET_NAME = 'org.mozilla.crash-stats.production.crashes'
REGION = 'us-west-2'
RESULTS = 100
SUPERSEARCH_URL = "https://crash-stats.mozilla.org/api/SuperSearch/"
HIT_TMPL = '%(date)-32s %(product)-10s %(version)-10s %(uuid)-20s'
FILENAME_TMPL = 'v1/dump/%(crashid)s'


def build_s3_client(access_key, secret_access_key):
    session = boto3.session.Session(
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_access_key
    )
    client = session.client(
        service_name='s3',
        region_name=REGION,
        config=Config(s3={'addressing_style': 'path'})
    )
    return client


def get_by_query(query):
    url = SUPERSEARCH_URL + '?' + '&'.join(query)
    return requests.get(url)
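
# For example, get_by_query(('product=Firefox', '_results_number=100')) requests
# https://crash-stats.mozilla.org/api/SuperSearch/?product=Firefox&_results_number=100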


def print_results(results):
    """Print the results of the SuperSearch query

    This helps to make sure the query is correct and we're getting back
    appropriate results.
    """
    print(HIT_TMPL % {'date': 'date', 'product': 'product', 'version': 'version', 'uuid': 'uuid'})
    for res in results:
        print(HIT_TMPL % res)


def fetch_and_save(s3_client, dir_, hits):
    """Fetch the dumps for the specified crashes and save them in the
    specified directory
    """
    if not os.path.exists(dir_):
        os.makedirs(dir_)
    print(dir_)
    print_results(hits)
    for hit in hits:
        crashid = hit['uuid']
        fn = os.path.join(dir_, crashid)
        if os.path.exists(fn):
            print('already exists %s' % fn)
            continue
        print('fetching %s upload_file_minidump...' % crashid)
        with open(fn, 'wb') as fp:
            resp = s3_client.get_object(
                Bucket=BUCKET_NAME,
                Key=FILENAME_TMPL % {'crashid': crashid}
            )
            fp.write(resp['Body'].read())


def analyze(dir_):
    """Analyze the files in the directory and print stats to stdout"""
    path = pathlib.Path(dir_)
    files = [(f, f.stat().st_size) for f in path.glob('**/*') if f.is_file()]
    files.sort(key=lambda part: part[1])
    print(dir_)
    print(' Number of files: %10d' % len(files))
    print(' Average size: %10d' % (sum([f[1] for f in files]) / len(files)))
    print(' Median size: %10d' % files[int(len(files) / 2)][1])
    print(' 95%% size: %10d' % files[int(len(files) * 0.95)][1])
    print(' Max size: %10d' % files[-1][1])
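
# If the dumps have already been downloaded, the analysis step can be re-run
# on its own from a Python shell, e.g.:
#
#     >>> from get_ids_1337688 import analyze
#     >>> analyze('./before')
#     >>> analyze('./after')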


def main(args):
    access_key, secret_access_key = args

    # Build an S3 client which we'll use to pull down dump files
    s3_client = build_s3_client(access_key, secret_access_key)

    # Get all crash ids that match:
    # - product: Firefox
    # - channel: nightly
    # - OS: windows
    # - build id < 20170209030214 vs. build id >= 20170209030214
    # - 2/1 to 2/28

    # We get some crashes that match our criteria per day for a range of days.
    # This is the "before the change" set.
    for day in range(1, 10):
        date = '2017-02-%02d' % day
        before_query = (
            'product=Firefox',
            'release_channel=nightly',
            'platform=Windows NT',
            'build_id=<20170209030214',
            'date=>' + date,
            'date=<2017-02-%02d' % (day + 1),
            '_results_number=%d' % RESULTS,
        )
        resp = get_by_query(before_query)
        hits = resp.json()['hits']
        fetch_and_save(s3_client, os.path.join('.', 'before', date), hits)

    # We get some crashes that match our criteria per day for a range of days.
    # This is the "after the change" set.
    for day in range(10, 19):
        date = '2017-02-%02d' % day
        after_query = (
            'product=Firefox',
            'release_channel=nightly',
            'platform=Windows NT',
            'build_id=>=20170209030214',
            'date=>' + date,
            'date=<2017-02-%02d' % (day + 1),
            '_results_number=%d' % RESULTS,
        )
        resp = get_by_query(after_query)
        hits = resp.json()['hits']
        fetch_and_save(s3_client, os.path.join('.', 'after', date), hits)

    # Analyze the before and after sets--these print to stdout
    analyze(os.path.join('.', 'before'))
    analyze(os.path.join('.', 'after'))


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))