import json
import os
import duckdb
import boto3
import datetime
from typing import Any, Dict


def construct_prepared_sql_and_params(sql_template, bind_params):
    single_value_params = {k: v for k, v in bind_params.items() if not isinstance(v, list)}
    list_params = {k: v for k, v in bind_params.items() if isinstance(v, list)}
    for key in single_value_params.keys():
        sql_template = sql_template.replace(f"{{{key}}}", "?")
    for key, values in list_params.items():
        placeholders = ', '.join(['?'] * len(values))
        sql_template = sql_template.replace(f"{{{key}}}", placeholders)
    flat_params = []
    for key in bind_params:
        if key in single_value_params:
            flat_params.append(single_value_params[key])
        elif key in list_params:
            flat_params.extend(list_params[key])
    return sql_template, flat_params


def serialize_dates(row):
    for key, value in row.items():
        if isinstance(value, datetime.date):
            row[key] = value.strftime('%d-%m-%Y')
    return row


def lambda_handler(event: Dict[str, Any], context):
    print("Duckdb Lambda received event: " + json.dumps(event, indent=4))
    file_name = event.get('file_name')
    sql = event.get('sql')
    bind_params = event.get('bind_params', {})
    return_type = event.get('return_type', 'List')
    s3 = boto3.client('s3')
    bucket_name = os.getenv('BUCKET_NAME')
    local_file_name = f"/tmp/{file_name}"
    os.makedirs(os.path.dirname(local_file_name), exist_ok=True)
    s3.download_file(bucket_name, file_name, local_file_name)
    conn = duckdb.connect(database=':memory:', read_only=False)
    prepared_sql = sql % local_file_name
    try:
        prepared_sql, flat_params = construct_prepared_sql_and_params(prepared_sql, bind_params)
        print(prepared_sql)
        print(f"flat_params -> {flat_params}")
        cursor = conn.execute(prepared_sql, parameters=flat_params)
        columns = [description[0] for description in cursor.description]
        fetched_results = [dict(zip(columns, row)) for row in cursor.fetchall()]
        fetched_results = [serialize_dates(row) for row in fetched_results]
        if return_type == 'Single':
            results = fetched_results[0] if fetched_results else {}
        else:
            results = fetched_results
    finally:
        conn.close()
    print("Duckdb Lambda processing completed")
    return results
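For reference, the handler above expects an event shaped roughly like the sketch below; the object key, SQL, and bind parameter names are made-up illustrations, not values from the gist. The %s is substituted with the downloaded /tmp path, and each {name} placeholder becomes a DuckDB ? parameter via construct_prepared_sql_and_params.

# Hypothetical test event for lambda_handler; the S3 key, SQL, and parameter
# names are invented for illustration.
example_event = {
    "file_name": "orders/2024/orders.parquet",   # S3 key, downloaded to /tmp
    "sql": "SELECT * FROM read_parquet('%s') "
           "WHERE total > {min_total} AND status IN ({statuses})",
    "bind_params": {"min_total": 100, "statuses": ["SHIPPED", "DELIVERED"]},
    "return_type": "List",                       # or "Single" for one row
}
# lambda_handler(example_event, None) would download the object from the
# BUCKET_NAME bucket and run the query with parameters [100, "SHIPPED", "DELIVERED"].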
Steps:

1. Create the Layer:
   - Follow the same steps as above to create a virtual environment and install the duckdb module.
   - Zip the contents of the virtual environment's site-packages directory:

     cd myenv/lib/python3.*/site-packages
     zip -r ../../../../duckdb_layer.zip .

2. Create a Lambda Layer in the AWS Console:
   - Go to the AWS Lambda console and navigate to the Layers section.
   - Click Create layer, give it a name, and upload the duckdb_layer.zip file.

3. Attach the Layer to Your Lambda Function:
   - In your Lambda function's configuration, go to the Layers section and add the duckdb layer you just created.
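If you prefer to script the console steps, a rough boto3 sketch of publishing the layer; the layer name, bucket, and key are placeholders for wherever you uploaded duckdb_layer.zip:

import boto3

lambda_client = boto3.client("lambda")

# Publish a new layer version from the zip uploaded to S3 (placeholder names).
response = lambda_client.publish_layer_version(
    LayerName="duckdb",
    Description="DuckDB for the Python 3.9 runtime",
    Content={"S3Bucket": "my-layer-bucket", "S3Key": "duckdb_layer.zip"},
    CompatibleRuntimes=["python3.9"],
)
print(response["LayerVersionArn"])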
@vinayakvanarse did you manage to get the layer for duck db working? I need some tips too, have the same error
Hi, did anyone manage to get this working please as currently I am facing the same issue.
I made a Python 3.9 x86_64 layer via CloudShell in the AWS console in the browser, and that works fine.
Hi @alexander-grun, thanks for your reply. Unfortunately I am still not able to make it work. These are the steps I am currently following:
- Go into AWS CloudShell
- Create a directory pythonTest2
mkdir pythonTest2
- Run the following to install the packages
pip3 install duckdb -t pythonTest2/
- Zip the folder
zip -r9 pythonTest2.zip pythonTest2
- Upload the ZIP file to an S3 bucket
aws s3 cp pythonTest2.zip s3://d-lambdatest
- Create the lambda layer
- Assign the layer to the lambda function
Are these steps correct please or am I missing something?
Thanks!
@julian-calleja try also setting env variable in lambda as HOME:/tmp
The layer I made is here if you want to give it a try: arn:aws:lambda:eu-north-1:399679954749:layer:duck39:1
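For anyone scripting this rather than clicking through the console, a rough boto3 sketch of setting that variable; the function name is a placeholder, and note that the Environment argument replaces the function's entire variable set:

import boto3

boto3.client("lambda").update_function_configuration(
    FunctionName="my-duckdb-function",            # placeholder
    Environment={"Variables": {"HOME": "/tmp"}},  # replaces any existing variables
)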
Hi @alexander-grun!
I just found this page, and I see that you shared your arn location. Can you please share the package itself in the zipped format, which can be uploaded to AWS Layers?
Thank you,
Balazs
Can I also get the zip? [email protected]
Here is a step-by-step guide on how to create your own Lambda Layer via Docker and upload it to AWS
https://alexander-grun.github.io/ag-blog/duck-db-aws-lambda-layer/
Thank you, I got the zip, created a Layer with it, and attached it to my function (it's strange that I can't choose it from Custom Layers and can only add it by ARN, but at least that works), and I'm still getting the same error:
Function Logs:
[ERROR] Runtime.ImportModuleError: Unable to import module 'lambda_function': No module named 'duckdb.duckdb'
Traceback (most recent call last):
The code:
import json
import duckdb

def lambda_handler(event, context):
    # TODO implement
    return {
        'statusCode': 200,
        'body': json.dumps('Hello from Lambda!')
    }
If I repack it without the "python" folder, so the subfolders are directly at the root of the zip, I'm still getting the error:
{
    "errorMessage": "Unable to import module 'lambda_function': No module named 'duckdb'",
    "errorType": "Runtime.ImportModuleError",
    "requestId": "",
    "stackTrace": []
}
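One way to narrow this kind of error down is a throwaway handler that lists what the layer actually put under /opt; for Python runtimes, Lambda only adds /opt/python (and its site-packages) to the import path, which is why the packages need to sit under a top-level python/ folder in the layer zip. A minimal debugging sketch:

import os

def lambda_handler(event, context):
    # Layers are extracted under /opt; the Python runtime imports from /opt/python.
    return {
        "opt": os.listdir("/opt"),
        "opt_python": os.listdir("/opt/python") if os.path.isdir("/opt/python") else None,
    }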
@bbalazs-cubix in the environment variable of Lambda add Home: /tmp as a variable and try again
Thanks for the article, unfortunately that uses Glue, not Lambda, but it was interesting. Tried creating an environment variable called "Home" with the value "/tmp", but I'm getting the same import error. Trying to look for another solution.
this works for me.
ref: https://alexander-grun.github.io/ag-blog/duck-db-aws-lambda-layer/
To make it run on Lambda, I also need to set up home_directory. This is my simple Lambda code.
import json
import duckdb

def lambda_handler(event, context):
    print('Hi')
    duckdb_conn = duckdb.connect()
    print(f"Installing duckdb extensions. Installing httpfs.")
    duckdb_conn.sql("SET home_directory='/tmp';")
    duckdb_conn.sql("install httpfs")
    print(f"Loading httpfs")
    duckdb_conn.sql("load httpfs")
    print('LOADED!')

Hi, I created the Lambda function from a Docker image using ECR and it worked. Now I want to connect to MotherDuck to load the data, but I get an error:
IO Error: Can't find the home directory at ''
Specify a home directory using the SET home_directory='/path/to/dir' option.
I would appreciate it if someone who has already run into this could help me.
how do you set the home directory in your image?
Hi @jeroos, I am working on including DuckDB in a Python 3.12 Lambda function. I am attaching a layer to the function, but I keep getting the error [ERROR] Runtime.ImportModuleError: Unable to import module 'lambda_function': No module named 'duckdb.duckdb'. Could you please explain how you were able to execute the Lambda and, if you used a layer, what your steps were? I know this is basic, but I have spent a good number of tries on my Mac to get it working.