https://stackoverflow.com/questions/50291963/how-to-query-in-aws-athena-connected-through-s3-using-lambda-functions-in-python
gentable.py
import boto3
import time
ftext = open("alltables.sql")
queries = []
queriestxt = []
dropddl = []
for line in ftext:
line = line.strip()
if line.startswith("--"): continue
if "drop table" in line:
dropddl.append(line)
queries.append("\n".join(queriestxt))
queriestxt = []
continue
queriestxt.append(line)
if len(queriestxt) > 0:
queries.append("\n".join(queriestxt))
queries = queries[1:]
for d in dropddl:
print("drop>", d)
for q in queries:
print("q>", q[:q.index("(")])
print("="*50)
c = boto3.client('athena')
db = 'tpcds'
#query = "select * from tpcds.store_sales limit 100"
location = "s3://yours3bucket/tpcds-tests-emr/3"
for query in dropddl:
print("query", query)
queryStart = c.start_query_execution(
QueryString = query,
QueryExecutionContext = {
'Database': db,
},
ResultConfiguration = {
'OutputLocation': 's3://<your s3 bucket>/athena/',
}
)
queryExecution = {
'QueryExecution': {
'Status': 'QUEUED'
}
}
while queryExecution['QueryExecution']['Status'] not in ['QUEUED', 'RUNNING']:
queryExecution = c.get_query_execution(
QueryExecutionId=queryStart['QueryExecutionId']
)
time.sleep(2)
for query in queries:
print("query", query)
query = query.replace("${LOCATION}", location)
queryStart = c.start_query_execution(
QueryString = query,
QueryExecutionContext = {
'Database': db,
},
ResultConfiguration = {
'OutputLocation': 's3://<your s3 bucket>/athena/',
}
)
queryExecution = {
'QueryExecution': {
'Status': {
'State': 'QUEUED'
}
}
}
while queryExecution['QueryExecution']['Status']['State'] in ['QUEUED', 'RUNNING']:
queryExecution = c.get_query_execution(
QueryExecutionId=queryStart['QueryExecutionId']
)
print("to sleep", queryExecution['QueryExecution']['Status']['State'])
time.sleep(2)
querydetail.py
import boto3
from pprint import pprint
c = boto3.client('athena')
execid = '639211eb-e9c5-4530-b0a4-3ead589b2524'
queryStart = {
'QueryExecutionId': execid,
}
queryExecution = c.get_query_execution(
QueryExecutionId=queryStart['QueryExecutionId']
)
pprint(queryExecution)
{'QueryExecution': {'EngineVersion': {'EffectiveEngineVersion': 'Athena engine '
'version 2',
'SelectedEngineVersion': 'AUTO'},
'Query': 'select * from tpcds.store_sales limit 100',
'QueryExecutionContext': {'Database': 'tpcds'},
'QueryExecutionId': '639211eb-e9c5-4530-b0a4-3ead589b2524',
'ResultConfiguration': {'OutputLocation': 's3:///athena/639211eb-e9c5-4530-b0a4-3ead589b2524.csv'},
'StatementType': 'DML',
'Statistics': {'DataScannedInBytes': 1412475,
'EngineExecutionTimeInMillis': 673,
'QueryPlanningTimeInMillis': 92,
'QueryQueueTimeInMillis': 117,
'ServiceProcessingTimeInMillis': 34,
'TotalExecutionTimeInMillis': 824},
'Status': {'CompletionDateTime': datetime.datetime(2022, 5, 24, 21, 56, 57, 315000, tzinfo=tzlocal()),
'State': 'SUCCEEDED',
'SubmissionDateTime': datetime.datetime(2022, 5, 24, 21, 56, 56, 491000, tzinfo=tzlocal())},
'WorkGroup': 'primary'},
'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
'content-length': '1449',
'content-type': 'application/x-amz-json-1.1',
'date': 'Wed, 25 May 2022 01:59:02 GMT',
'x-amzn-requestid': 'b7bb2c2e-4949-46ed-ba75-eaf67803bc46'},
'HTTPStatusCode': 200,
'RequestId': 'b7bb2c2e-4949-46ed-ba75-eaf67803bc46',
'RetryAttempts': 0}}
runquery.py
import boto3
from pprint import pprint
import time
c = boto3.client('athena')
query = """
select
sum(ss_net_profit) as total_sum
,s_state
,s_county
,grouping(s_state)+grouping(s_county) as lochierarchy
,rank() over (
partition by grouping(s_state)+grouping(s_county),
case when grouping(s_county) = 0 then s_state end
order by sum(ss_net_profit) desc) as rank_within_parent
from
store_sales
,date_dim d1
,store
where
d1.d_month_seq between 1218 and 1218+11
and d1.d_date_sk = ss_sold_date_sk
and s_store_sk = ss_store_sk
and s_state in
( select s_state
from (select s_state as s_state,
rank() over ( partition by s_state order by sum(ss_net_profit) desc) as ranking
from store_sales, store, date_dim
where d_month_seq between 1218 and 1218+11
and d_date_sk = ss_sold_date_sk
and s_store_sk = ss_store_sk
group by s_state
) tmp1
where ranking <= 5
)
group by rollup(s_state,s_county)
order by
lochierarchy desc
,case when lochierarchy = 0 then s_state end
,rank_within_parent
limit 100;
"""
db = "tpcds"
print("query", query)
queryStart = c.start_query_execution(
QueryString = query,
QueryExecutionContext = {
'Database': db,
},
ResultConfiguration = {
'OutputLocation': 's3:///athena/',
}
)
queryExecution = {
'QueryExecution': {
'Status': {
'State': 'QUEUED'
}
}
}
while queryExecution['QueryExecution']['Status']['State'] in ['QUEUED', 'RUNNING']:
queryExecution = c.get_query_execution(
QueryExecutionId=queryStart['QueryExecutionId']
)
print("to sleep", queryExecution['QueryExecution']['Status']['State'])
time.sleep(2)
print("out")
pprint(queryExecution)
if queryExecution['QueryExecution']['Status']['State'] == 'SUCCEEDED':
results = c.get_query_results(QueryExecutionId=queryStart['QueryExecutionId'])
pprint(results)
output
query
select
sum(ss_net_profit) as total_sum
,s_state
,s_county
,grouping(s_state)+grouping(s_county) as lochierarchy
,rank() over (
partition by grouping(s_state)+grouping(s_county),
case when grouping(s_county) = 0 then s_state end
order by sum(ss_net_profit) desc) as rank_within_parent
from
store_sales
,date_dim d1
,store
where
d1.d_month_seq between 1218 and 1218+11
and d1.d_date_sk = ss_sold_date_sk
and s_store_sk = ss_store_sk
and s_state in
( select s_state
from (select s_state as s_state,
rank() over ( partition by s_state order by sum(ss_net_profit) desc) as ranking
from store_sales, store, date_dim
where d_month_seq between 1218 and 1218+11
and d_date_sk = ss_sold_date_sk
and s_store_sk = ss_store_sk
group by s_state
) tmp1
where ranking <= 5
)
group by rollup(s_state,s_county)
order by
lochierarchy desc
,case when lochierarchy = 0 then s_state end
,rank_within_parent
limit 100;
to sleep QUEUED
to sleep RUNNING
to sleep RUNNING
to sleep RUNNING
to sleep SUCCEEDED
out
{'QueryExecution': {'EngineVersion': {'EffectiveEngineVersion': 'Athena engine '
'version 2',
'SelectedEngineVersion': 'AUTO'},
'Query': 'select \n'
' sum(ss_net_profit) as total_sum\n'
' ,s_state\n'
' ,s_county\n'
' ,grouping(s_state)+grouping(s_county) as '
'lochierarchy\n'
' ,rank() over (\n'
' partition by '
'grouping(s_state)+grouping(s_county),\n'
' case when grouping(s_county) = 0 then '
's_state end \n'
' order by sum(ss_net_profit) desc) as '
'rank_within_parent\n'
' from\n'
' store_sales\n'
' ,date_dim d1\n'
' ,store\n'
' where\n'
' d1.d_month_seq between 1218 and 1218+11\n'
' and d1.d_date_sk = ss_sold_date_sk\n'
' and s_store_sk = ss_store_sk\n'
' and s_state in\n'
' ( select s_state\n'
' from (select s_state as '
's_state,\n'
' rank() over ( '
'partition by s_state order by sum(ss_net_profit) '
'desc) as ranking\n'
' from store_sales, store, '
'date_dim\n'
' where d_month_seq between '
'1218 and 1218+11\n'
' and d_date_sk = '
'ss_sold_date_sk\n'
' and s_store_sk = '
'ss_store_sk\n'
' group by s_state\n'
' ) tmp1 \n'
' where ranking <= 5\n'
' )\n'
' group by rollup(s_state,s_county)\n'
' order by\n'
' lochierarchy desc\n'
' ,case when lochierarchy = 0 then s_state end\n'
' ,rank_within_parent\n'
' limit 100',
'QueryExecutionContext': {'Database': 'tpcds'},
'QueryExecutionId': '4a848c0c-61c2-42bc-a8c0-ae2dd0d6f6e8',
'ResultConfiguration': {'OutputLocation': 's3:///athena/4a848c0c-61c2-42bc-a8c0-ae2dd0d6f6e8.csv'},
'StatementType': 'DML',
'Statistics': {'DataScannedInBytes': 2381577798,
'EngineExecutionTimeInMillis': 7601,
'QueryPlanningTimeInMillis': 5276,
'QueryQueueTimeInMillis': 157,
'ServiceProcessingTimeInMillis': 31,
'TotalExecutionTimeInMillis': 7789},
'Status': {'CompletionDateTime': datetime.datetime(2022, 5, 24, 23, 41, 50, 619000, tzinfo=tzlocal()),
'State': 'SUCCEEDED',
'SubmissionDateTime': datetime.datetime(2022, 5, 24, 23, 41, 42, 830000, tzinfo=tzlocal())},
'WorkGroup': 'primary'},
'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
'content-length': '3865',
'content-type': 'application/x-amz-json-1.1',
'date': 'Wed, 25 May 2022 03:41:50 GMT',
'x-amzn-requestid': '150e24e0-3a86-40c3-af00-ae3f558dc915'},
'HTTPStatusCode': 200,
'RequestId': '150e24e0-3a86-40c3-af00-ae3f558dc915',
'RetryAttempts': 0}}
{'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
'content-length': '2813',
'content-type': 'application/x-amz-json-1.1',
'date': 'Wed, 25 May 2022 03:41:52 GMT',
'x-amzn-requestid': 'a89078dc-99a4-4b2e-ba18-b675e441b4a7'},
'HTTPStatusCode': 200,
'RequestId': 'a89078dc-99a4-4b2e-ba18-b675e441b4a7',
'RetryAttempts': 0},
'ResultSet': {'ResultSetMetadata': {'ColumnInfo': [{'CaseSensitive': False,
'CatalogName': 'hive',
'Label': 'total_sum',
'Name': 'total_sum',
'Nullable': 'UNKNOWN',
'Precision': 38,
'Scale': 2,
'SchemaName': '',
'TableName': '',
'Type': 'decimal'},
{'CaseSensitive': True,
'CatalogName': 'hive',
'Label': 's_state',
'Name': 's_state',
'Nullable': 'UNKNOWN',
'Precision': 2,
'Scale': 0,
'SchemaName': '',
'TableName': '',
'Type': 'char'},
{'CaseSensitive': True,
'CatalogName': 'hive',
'Label': 's_county',
'Name': 's_county',
'Nullable': 'UNKNOWN',
'Precision': 30,
'Scale': 0,
'SchemaName': '',
'TableName': '',
'Type': 'varchar'},
{'CaseSensitive': False,
'CatalogName': 'hive',
'Label': 'lochierarchy',
'Name': 'lochierarchy',
'Nullable': 'UNKNOWN',
'Precision': 10,
'Scale': 0,
'SchemaName': '',
'TableName': '',
'Type': 'integer'},
{'CaseSensitive': False,
'CatalogName': 'hive',
'Label': 'rank_within_parent',
'Name': 'rank_within_parent',
'Nullable': 'UNKNOWN',
'Precision': 19,
'Scale': 0,
'SchemaName': '',
'TableName': '',
'Type': 'bigint'}]},
'Rows': [{'Data': [{'VarCharValue': 'total_sum'},
{'VarCharValue': 's_state'},
{'VarCharValue': 's_county'},
{'VarCharValue': 'lochierarchy'},
{'VarCharValue': 'rank_within_parent'}]},
{'Data': [{'VarCharValue': '-1317652715.60'},
{},
{},
{'VarCharValue': '2'},
{'VarCharValue': '1'}]},
{'Data': [{'VarCharValue': '-1317652715.60'},
{'VarCharValue': 'TN'},
{},
{'VarCharValue': '1'},
{'VarCharValue': '1'}]},
{'Data': [{'VarCharValue': '-77915300.74'},
{'VarCharValue': 'TN'},
{},
{'VarCharValue': '0'},
{'VarCharValue': '1'}]},
{'Data': [{'VarCharValue': '-1239737414.86'},
{'VarCharValue': 'TN'},
{'VarCharValue': 'Williamson County'},
{'VarCharValue': '0'},
{'VarCharValue': '2'}]}]},
'UpdateCount': 0}