Carlos Aguni

Highly motivated self-taught IT analyst. Always learning and ready to explore new skills. An eternal apprentice.


Boto3 Athena

25 May 2022 »

https://stackoverflow.com/questions/50291963/how-to-query-in-aws-athena-connected-through-s3-using-lambda-functions-in-python

gentable.py

import boto3
import time



ftext = open("alltables.sql")


queries = []
queriestxt = []
dropddl = []
for line in ftext:
    line = line.strip()
    if line.startswith("--"): continue
    if "drop table" in line:
        dropddl.append(line)
        queries.append("\n".join(queriestxt))
        queriestxt = []
        continue
    queriestxt.append(line)
if len(queriestxt) > 0:
    queries.append("\n".join(queriestxt))

queries = queries[1:]

for d in dropddl:
    print("drop>", d)

for q in queries:
    print("q>", q[:q.index("(")])
    print("="*50)


c = boto3.client('athena')

db = 'tpcds'

#query = "select * from tpcds.store_sales limit 100"
location = "s3://yours3bucket/tpcds-tests-emr/3"

for query in dropddl:

    print("query", query)

    queryStart = c.start_query_execution(
        QueryString = query,
        QueryExecutionContext = {
            'Database': db,
        },
        ResultConfiguration = {
            'OutputLocation': 's3://<your s3 bucket>/athena/',
        }
    )

    queryExecution = {
        'QueryExecution': {
            'Status': 'QUEUED'
        }
    }
    
    while queryExecution['QueryExecution']['Status'] not in ['QUEUED', 'RUNNING']:
        queryExecution = c.get_query_execution(
            QueryExecutionId=queryStart['QueryExecutionId']
        )
        time.sleep(2)


for query in queries:

    print("query", query)
    query = query.replace("${LOCATION}", location)

    queryStart = c.start_query_execution(
        QueryString = query,
        QueryExecutionContext = {
            'Database': db,
        },
        ResultConfiguration = {
            'OutputLocation': 's3://<your s3 bucket>/athena/',
        }
    )

    queryExecution = {
        'QueryExecution': {
            'Status': {
                'State': 'QUEUED'
            }
        }
    }
    
    while queryExecution['QueryExecution']['Status']['State'] in ['QUEUED', 'RUNNING']:
        queryExecution = c.get_query_execution(
            QueryExecutionId=queryStart['QueryExecutionId']
        )
        print("to sleep", queryExecution['QueryExecution']['Status']['State'])
        time.sleep(2)

querydetail.py

import boto3
from pprint import pprint

c = boto3.client('athena')


execid = '639211eb-e9c5-4530-b0a4-3ead589b2524' 

queryStart = {
    'QueryExecutionId': execid,
}
queryExecution = c.get_query_execution(
    QueryExecutionId=queryStart['QueryExecutionId']
)

pprint(queryExecution)

{'QueryExecution': {'EngineVersion': {'EffectiveEngineVersion': 'Athena engine '
                                                                'version 2',
                                      'SelectedEngineVersion': 'AUTO'},
                    'Query': 'select * from tpcds.store_sales limit 100',
                    'QueryExecutionContext': {'Database': 'tpcds'},
                    'QueryExecutionId': '639211eb-e9c5-4530-b0a4-3ead589b2524',
                    'ResultConfiguration': {'OutputLocation': 's3:///athena/639211eb-e9c5-4530-b0a4-3ead589b2524.csv'},
                    'StatementType': 'DML',
                    'Statistics': {'DataScannedInBytes': 1412475,
                                   'EngineExecutionTimeInMillis': 673,
                                   'QueryPlanningTimeInMillis': 92,
                                   'QueryQueueTimeInMillis': 117,
                                   'ServiceProcessingTimeInMillis': 34,
                                   'TotalExecutionTimeInMillis': 824},
                    'Status': {'CompletionDateTime': datetime.datetime(2022, 5, 24, 21, 56, 57, 315000, tzinfo=tzlocal()),
                               'State': 'SUCCEEDED',
                               'SubmissionDateTime': datetime.datetime(2022, 5, 24, 21, 56, 56, 491000, tzinfo=tzlocal())},
                    'WorkGroup': 'primary'},
 'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
                                      'content-length': '1449',
                                      'content-type': 'application/x-amz-json-1.1',
                                      'date': 'Wed, 25 May 2022 01:59:02 GMT',
                                      'x-amzn-requestid': 'b7bb2c2e-4949-46ed-ba75-eaf67803bc46'},
                      'HTTPStatusCode': 200,
                      'RequestId': 'b7bb2c2e-4949-46ed-ba75-eaf67803bc46',
                      'RetryAttempts': 0}}

runquery.py

import boto3
from pprint import pprint
import time


c = boto3.client('athena')

query = """
select  
    sum(ss_net_profit) as total_sum
   ,s_state
   ,s_county
   ,grouping(s_state)+grouping(s_county) as lochierarchy
   ,rank() over (
        partition by grouping(s_state)+grouping(s_county),
        case when grouping(s_county) = 0 then s_state end 
        order by sum(ss_net_profit) desc) as rank_within_parent
 from
    store_sales
   ,date_dim       d1
   ,store
 where
    d1.d_month_seq between 1218 and 1218+11
 and d1.d_date_sk = ss_sold_date_sk
 and s_store_sk  = ss_store_sk
 and s_state in
             ( select s_state
               from  (select s_state as s_state,
                            rank() over ( partition by s_state order by sum(ss_net_profit) desc) as ranking
                      from   store_sales, store, date_dim
                      where  d_month_seq between 1218 and 1218+11
                            and d_date_sk = ss_sold_date_sk
                            and s_store_sk  = ss_store_sk
                      group by s_state
                     ) tmp1 
               where ranking <= 5
             )
 group by rollup(s_state,s_county)
 order by
   lochierarchy desc
  ,case when lochierarchy = 0 then s_state end
  ,rank_within_parent
 limit 100;
"""

db = "tpcds"
print("query", query)

queryStart = c.start_query_execution(
    QueryString = query,
    QueryExecutionContext = {
        'Database': db,
    },
    ResultConfiguration = {
        'OutputLocation': 's3:///athena/',
    }
)

queryExecution = {
    'QueryExecution': {
        'Status': {
            'State': 'QUEUED'
        }
    }
}

while queryExecution['QueryExecution']['Status']['State'] in ['QUEUED', 'RUNNING']:
    queryExecution = c.get_query_execution(
        QueryExecutionId=queryStart['QueryExecutionId']
    )
    print("to sleep", queryExecution['QueryExecution']['Status']['State'])
    time.sleep(2)

print("out")
pprint(queryExecution)

if queryExecution['QueryExecution']['Status']['State'] == 'SUCCEEDED':
    results = c.get_query_results(QueryExecutionId=queryStart['QueryExecutionId'])
    pprint(results)

output

query 
select  
    sum(ss_net_profit) as total_sum
   ,s_state
   ,s_county
   ,grouping(s_state)+grouping(s_county) as lochierarchy
   ,rank() over (
        partition by grouping(s_state)+grouping(s_county),
        case when grouping(s_county) = 0 then s_state end 
        order by sum(ss_net_profit) desc) as rank_within_parent
 from
    store_sales
   ,date_dim       d1
   ,store
 where
    d1.d_month_seq between 1218 and 1218+11
 and d1.d_date_sk = ss_sold_date_sk
 and s_store_sk  = ss_store_sk
 and s_state in
             ( select s_state
               from  (select s_state as s_state,
                            rank() over ( partition by s_state order by sum(ss_net_profit) desc) as ranking
                      from   store_sales, store, date_dim
                      where  d_month_seq between 1218 and 1218+11
                            and d_date_sk = ss_sold_date_sk
                            and s_store_sk  = ss_store_sk
                      group by s_state
                     ) tmp1 
               where ranking <= 5
             )
 group by rollup(s_state,s_county)
 order by
   lochierarchy desc
  ,case when lochierarchy = 0 then s_state end
  ,rank_within_parent
 limit 100;

to sleep QUEUED
to sleep RUNNING
to sleep RUNNING
to sleep RUNNING
to sleep SUCCEEDED
out
{'QueryExecution': {'EngineVersion': {'EffectiveEngineVersion': 'Athena engine '
                                                                'version 2',
                                      'SelectedEngineVersion': 'AUTO'},
                    'Query': 'select  \n'
                             '    sum(ss_net_profit) as total_sum\n'
                             '   ,s_state\n'
                             '   ,s_county\n'
                             '   ,grouping(s_state)+grouping(s_county) as '
                             'lochierarchy\n'
                             '   ,rank() over (\n'
                             '        partition by '
                             'grouping(s_state)+grouping(s_county),\n'
                             '        case when grouping(s_county) = 0 then '
                             's_state end \n'
                             '        order by sum(ss_net_profit) desc) as '
                             'rank_within_parent\n'
                             ' from\n'
                             '    store_sales\n'
                             '   ,date_dim       d1\n'
                             '   ,store\n'
                             ' where\n'
                             '    d1.d_month_seq between 1218 and 1218+11\n'
                             ' and d1.d_date_sk = ss_sold_date_sk\n'
                             ' and s_store_sk  = ss_store_sk\n'
                             ' and s_state in\n'
                             '             ( select s_state\n'
                             '               from  (select s_state as '
                             's_state,\n'
                             '                            rank() over ( '
                             'partition by s_state order by sum(ss_net_profit) '
                             'desc) as ranking\n'
                             '                      from   store_sales, store, '
                             'date_dim\n'
                             '                      where  d_month_seq between '
                             '1218 and 1218+11\n'
                             '                            and d_date_sk = '
                             'ss_sold_date_sk\n'
                             '                            and s_store_sk  = '
                             'ss_store_sk\n'
                             '                      group by s_state\n'
                             '                     ) tmp1 \n'
                             '               where ranking <= 5\n'
                             '             )\n'
                             ' group by rollup(s_state,s_county)\n'
                             ' order by\n'
                             '   lochierarchy desc\n'
                             '  ,case when lochierarchy = 0 then s_state end\n'
                             '  ,rank_within_parent\n'
                             ' limit 100',
                    'QueryExecutionContext': {'Database': 'tpcds'},
                    'QueryExecutionId': '4a848c0c-61c2-42bc-a8c0-ae2dd0d6f6e8',
                    'ResultConfiguration': {'OutputLocation': 's3:///athena/4a848c0c-61c2-42bc-a8c0-ae2dd0d6f6e8.csv'},
                    'StatementType': 'DML',
                    'Statistics': {'DataScannedInBytes': 2381577798,
                                   'EngineExecutionTimeInMillis': 7601,
                                   'QueryPlanningTimeInMillis': 5276,
                                   'QueryQueueTimeInMillis': 157,
                                   'ServiceProcessingTimeInMillis': 31,
                                   'TotalExecutionTimeInMillis': 7789},
                    'Status': {'CompletionDateTime': datetime.datetime(2022, 5, 24, 23, 41, 50, 619000, tzinfo=tzlocal()),
                               'State': 'SUCCEEDED',
                               'SubmissionDateTime': datetime.datetime(2022, 5, 24, 23, 41, 42, 830000, tzinfo=tzlocal())},
                    'WorkGroup': 'primary'},
 'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
                                      'content-length': '3865',
                                      'content-type': 'application/x-amz-json-1.1',
                                      'date': 'Wed, 25 May 2022 03:41:50 GMT',
                                      'x-amzn-requestid': '150e24e0-3a86-40c3-af00-ae3f558dc915'},
                      'HTTPStatusCode': 200,
                      'RequestId': '150e24e0-3a86-40c3-af00-ae3f558dc915',
                      'RetryAttempts': 0}}
{'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
                                      'content-length': '2813',
                                      'content-type': 'application/x-amz-json-1.1',
                                      'date': 'Wed, 25 May 2022 03:41:52 GMT',
                                      'x-amzn-requestid': 'a89078dc-99a4-4b2e-ba18-b675e441b4a7'},
                      'HTTPStatusCode': 200,
                      'RequestId': 'a89078dc-99a4-4b2e-ba18-b675e441b4a7',
                      'RetryAttempts': 0},
 'ResultSet': {'ResultSetMetadata': {'ColumnInfo': [{'CaseSensitive': False,
                                                     'CatalogName': 'hive',
                                                     'Label': 'total_sum',
                                                     'Name': 'total_sum',
                                                     'Nullable': 'UNKNOWN',
                                                     'Precision': 38,
                                                     'Scale': 2,
                                                     'SchemaName': '',
                                                     'TableName': '',
                                                     'Type': 'decimal'},
                                                    {'CaseSensitive': True,
                                                     'CatalogName': 'hive',
                                                     'Label': 's_state',
                                                     'Name': 's_state',
                                                     'Nullable': 'UNKNOWN',
                                                     'Precision': 2,
                                                     'Scale': 0,
                                                     'SchemaName': '',
                                                     'TableName': '',
                                                     'Type': 'char'},
                                                    {'CaseSensitive': True,
                                                     'CatalogName': 'hive',
                                                     'Label': 's_county',
                                                     'Name': 's_county',
                                                     'Nullable': 'UNKNOWN',
                                                     'Precision': 30,
                                                     'Scale': 0,
                                                     'SchemaName': '',
                                                     'TableName': '',
                                                     'Type': 'varchar'},
                                                    {'CaseSensitive': False,
                                                     'CatalogName': 'hive',
                                                     'Label': 'lochierarchy',
                                                     'Name': 'lochierarchy',
                                                     'Nullable': 'UNKNOWN',
                                                     'Precision': 10,
                                                     'Scale': 0,
                                                     'SchemaName': '',
                                                     'TableName': '',
                                                     'Type': 'integer'},
                                                    {'CaseSensitive': False,
                                                     'CatalogName': 'hive',
                                                     'Label': 'rank_within_parent',
                                                     'Name': 'rank_within_parent',
                                                     'Nullable': 'UNKNOWN',
                                                     'Precision': 19,
                                                     'Scale': 0,
                                                     'SchemaName': '',
                                                     'TableName': '',
                                                     'Type': 'bigint'}]},
               'Rows': [{'Data': [{'VarCharValue': 'total_sum'},
                                  {'VarCharValue': 's_state'},
                                  {'VarCharValue': 's_county'},
                                  {'VarCharValue': 'lochierarchy'},
                                  {'VarCharValue': 'rank_within_parent'}]},
                        {'Data': [{'VarCharValue': '-1317652715.60'},
                                  {},
                                  {},
                                  {'VarCharValue': '2'},
                                  {'VarCharValue': '1'}]},
                        {'Data': [{'VarCharValue': '-1317652715.60'},
                                  {'VarCharValue': 'TN'},
                                  {},
                                  {'VarCharValue': '1'},
                                  {'VarCharValue': '1'}]},
                        {'Data': [{'VarCharValue': '-77915300.74'},
                                  {'VarCharValue': 'TN'},
                                  {},
                                  {'VarCharValue': '0'},
                                  {'VarCharValue': '1'}]},
                        {'Data': [{'VarCharValue': '-1239737414.86'},
                                  {'VarCharValue': 'TN'},
                                  {'VarCharValue': 'Williamson County'},
                                  {'VarCharValue': '0'},
                                  {'VarCharValue': '2'}]}]},
 'UpdateCount': 0}