-
-
Save jeroos/c4336e8e177ab634a91968431d8af4cd to your computer and use it in GitHub Desktop.
import datetime
import json
import os
import re
from typing import Any, Dict

import boto3
import duckdb
def construct_prepared_sql_and_params(sql_template, bind_params):
    """Convert ``{name}`` placeholders in *sql_template* to DuckDB ``?`` markers.

    Scalar values become a single ``?``; list values become a comma-separated
    run of ``?`` (for ``IN (...)`` clauses) and are flattened into the
    parameter list. Parameters are emitted in the order the placeholders
    appear in the template — not in dict-insertion order — so each value
    binds to the correct ``?`` slot even when the caller's dict ordering
    differs from the template, and a key referenced twice is bound twice.

    Placeholders with no matching key in *bind_params* are left untouched.

    Returns a ``(prepared_sql, flat_params)`` tuple.
    """
    placeholder = re.compile(r"\{(\w+)\}")
    flat_params = []

    def _substitute(match):
        key = match.group(1)
        if key not in bind_params:
            # Unknown placeholder: leave it for the caller to deal with.
            return match.group(0)
        value = bind_params[key]
        if isinstance(value, list):
            flat_params.extend(value)
            return ", ".join(["?"] * len(value))
        flat_params.append(value)
        return "?"

    prepared_sql = placeholder.sub(_substitute, sql_template)
    return prepared_sql, flat_params
def serialize_dates(row):
    """Render every date value in *row* as a 'DD-MM-YYYY' string, in place.

    Mutates and returns the same dict object (convenient for chaining).
    datetime.datetime values are also converted, as datetime is a subclass
    of date.
    """
    date_keys = [k for k, v in row.items() if isinstance(v, datetime.date)]
    for k in date_keys:
        row[k] = row[k].strftime('%d-%m-%Y')
    return row
def lambda_handler(event: Dict[str, Any], context):
    """AWS Lambda entry point: run a DuckDB query against a file fetched from S3.

    Expected event keys:
        file_name   -- S3 object key; also used as the path under /tmp.
        sql         -- SQL template with a '%s'-style slot for the local file
                       path and '{name}' placeholders for bind_params.
        bind_params -- optional dict of scalar or list bind values.
        return_type -- 'Single' returns the first row as a dict ({} if no
                       rows); anything else returns the full list of row dicts.

    The S3 bucket is taken from the BUCKET_NAME environment variable.
    Date values in the result are rendered as 'DD-MM-YYYY' strings.
    """
    print("Duckdb Lambda received event: " + json.dumps(event, indent=4))
    file_name = event.get('file_name')
    sql = event.get('sql')
    bind_params = event.get('bind_params', {})
    return_type = event.get('return_type', 'List')

    s3 = boto3.client('s3')
    bucket_name = os.getenv('BUCKET_NAME')
    # /tmp is the only writable filesystem in Lambda; create any sub-directories
    # implied by the S3 key before downloading.
    local_file_name = f"/tmp/{file_name}"
    os.makedirs(os.path.dirname(local_file_name), exist_ok=True)
    s3.download_file(bucket_name, file_name, local_file_name)

    conn = duckdb.connect(database=':memory:', read_only=False)
    try:
        # Inject the local file path, then convert {name} placeholders to '?'
        # parameters. Both steps live inside the try so the connection is
        # closed even if template substitution raises (e.g. a bad '%' slot).
        prepared_sql = sql % local_file_name
        prepared_sql, flat_params = construct_prepared_sql_and_params(prepared_sql, bind_params)
        print(prepared_sql)
        print(f"flat_params -> {flat_params}")
        cursor = conn.execute(prepared_sql, parameters=flat_params)
        columns = [description[0] for description in cursor.description]
        fetched_results = [dict(zip(columns, row)) for row in cursor.fetchall()]
        fetched_results = [serialize_dates(row) for row in fetched_results]
        if return_type == 'Single':
            results = fetched_results[0] if fetched_results else {}
        else:
            results = fetched_results
    finally:
        conn.close()
    print("Duckdb Lambda processing completed")
    return results
Hi, did anyone manage to get this working please as currently I am facing the same issue.
I made 3.9 python layer x86_64 via Cloudshell in AWS console in the browser, and that works fine.
Hi @alexander-grun, thanks for your reply. Unfortunately I am still not able to make it work. These are the steps I am currently following.
- Go into AWS CloudShell
- Create a directory pythonTest2
mkdir pythonTest2
- Run the following to install the packages
pip3 install duckdb -t pythonTest2/
pip3 install duckdb -t pythonTest2/
- Zip the folder
zip -r9 pythonTest2.zip pythonTest2
- Upload the ZIP file to an S3 bucket
aws s3 cp pythonTest2.zip s3://d-lambdatest
- Create the lambda layer
- Assign the layer to the lambda function
Are these steps correct please or am I missing something?
Thanks!
@julian-calleja try also setting env variable in lambda as HOME:/tmp
The layer I made is here if you want to give it a try: arn:aws:lambda:eu-north-1:399679954749:layer:duck39:1
Hi @alexander-grun!
I just found this page, and I see that you shared your arn location. Can you please share the package itself in the zipped format, which can be uploaded to AWS Layers?
Thank you,
Balazs
Can I also get the zip? [email protected]
Here is a step-by-step guide on how to create your own Lambda Layer via Docker and upload it to AWS
https://alexander-grun.github.io/ag-blog/duck-db-aws-lambda-layer/
Thank you, I got the zip, created a Layer with it, and attached it to my function (strangely, I can't choose it from Custom Layers — I can only add it via the ARN — but at least that works). However, I'm still getting the same error:
Function Logs:
[ERROR] Runtime.ImportModuleError: Unable to import module 'lambda_function': No module named 'duckdb.duckdb'
Traceback (most recent call last):
The code:
import json
import duckdb
def lambda_handler(event, context):
# TODO implement
return {
'statusCode': 200,
'body': json.dumps('Hello from Lambda!')
}
If I repack it, without the "python" folder, so the subfolders right directly in the zip, I'm still getting the error
{
"errorMessage": "Unable to import module 'lambda_function': No module named 'duckdb'",
"errorType": "Runtime.ImportModuleError",
"requestId": "",
"stackTrace": []
}
@bbalazs-cubix in the environment variable of Lambda add Home: /tmp as a variable and try again
Thanks for the article, unfortunately that uses Glue, not Lambda, but it was interesting. Tried creating an environment variable called "Home" with the value "/tmp", but I'm getting the same import error. Trying to look for another solution.
this works for me.
ref: https://alexander-grun.github.io/ag-blog/duck-db-aws-lambda-layer/
this works for me.
ref: https://alexander-grun.github.io/ag-blog/duck-db-aws-lambda-layer/
To make it run on Lambda, I also need to set up home_directory. This is my simple Lambda code.
import json
import duckdb
def lambda_handler(event, context):
print('Hi')
duckdb_conn = duckdb.connect()
print(f"Installing duckdb extensions. Installing httpfs.")
duckdb_conn.sql("SET home_directory='/tmp';")
duckdb_conn.sql("install httpfs")
print(f"Loading httpfs")
duckdb_conn.sql("load httpfs")
print('LOADED!')Hi, I created the lambda function from a Docker image using ECR and it worked. Now I want to connect to MotherDuck to load the data, but I get an error:
IO Error: Can't find the home directory at ''
Specify a home directory using the SET home_directory='/path/to/dir' option.
If this has already happened to anyone, I would appreciate your help.
Hi, I created the lambda function from a Docker image using ECR and it worked. Now I want to connect to MotherDuck to load the data, but I get an error: IO Error: Can't find the home directory at '' Specify a home directory using the SET home_directory='/path/to/dir' option.
If this has already happened to anyone, I would appreciate your help.
how do you set the home directory in your image?
Hi, did anyone manage to get this working please as currently I am facing the same issue.