Last active
November 21, 2023 05:21
-
-
Save bastosmichael/2042249dc593b8ebdbd5ea9aca0cf026 to your computer and use it in GitHub Desktop.
Revisions
-
bastosmichael revised this gist
Nov 21, 2023 . 1 changed file with 69 additions and 24 deletions. There are no files selected for viewing
# Notice shown by GitHub above this diff: "This file contains hidden or
# bidirectional Unicode text that may be interpreted or compiled differently
# than what appears below. To review, open the file in an editor that reveals
# hidden Unicode characters."
# Diff hunk header: @@ -6,53 +6,98 @@
# NOTE(review): the hunk starts at line 6 of the script; pd, np,
# train_test_split and RandomForestClassifier are presumably imported on the
# lines above the hunk -- confirm against the full file.
from sklearn.metrics import accuracy_score
from datetime import datetime, timedelta
import random


# Function to generate random dates
def generate_random_dates(start_date, end_date, num_dates):
    """Return ``num_dates`` datetimes sampled from a daily date range.

    Args:
        start_date: First day of the range, in any form ``pd.date_range``
            accepts.
        end_date: Last day of the range.
        num_dates: How many dates to draw.

    Returns:
        A list of ``datetime`` objects. ``random.choices`` samples with
        replacement, so the same day may appear more than once.
    """
    date_range = pd.date_range(start_date, end_date).to_pydatetime().tolist()
    return random.choices(date_range, k=num_dates)


def _categorize_delay(delay_days):
    """Map a delay in days to a class: 2 if > 90, 1 if > 60, otherwise 0."""
    # The larger threshold must be tested first; checking "> 60" first would
    # swallow every value above 90 and class 2 could never be produced.
    if delay_days > 90:
        return 2
    if delay_days > 60:
        return 1
    return 0


# Generating made-up data
np.random.seed(0)
# generate_random_dates draws from the stdlib ``random`` module, which the
# NumPy seed does not affect; seed it too so the dates are reproducible.
random.seed(0)
num_orders = 100
start_date = "2020-01-01"
end_date = "2023-12-31"

data = {
    "vendor_id": np.random.randint(1, 10, size=num_orders),
    "order_size": np.random.choice(["small", "medium", "large"], size=num_orders),
    "season": np.random.choice(
        ["winter", "spring", "summer", "autumn"], size=num_orders
    ),
    "original_estimated_date": generate_random_dates(start_date, end_date, num_orders),
    "updated_delivery_date": generate_random_dates(start_date, end_date, num_orders),
    "final_receipt_date": generate_random_dates(start_date, end_date, num_orders),
}

df = pd.DataFrame(data)

# Preprocess the data
# Delays are whole days between the actual receipt and each estimate; they are
# negative when the order arrived before the estimate.
df["original_delay"] = (
    pd.to_datetime(df["final_receipt_date"])
    - pd.to_datetime(df["original_estimated_date"])
).dt.days
df["updated_delay"] = (
    pd.to_datetime(df["final_receipt_date"])
    - pd.to_datetime(df["updated_delivery_date"])
).dt.days
df["delay_category"] = df["original_delay"].apply(_categorize_delay)

# Additional feature engineering
df["order_size"] = df["order_size"].map({"small": 1, "medium": 2, "large": 3})
df = pd.get_dummies(df, columns=["season", "vendor_id"])

# Splitting the dataset
# NOTE(review): original_delay stays in X although delay_category is computed
# directly from it, so the reported accuracy likely reflects the model
# relearning the thresholds rather than real predictive power.
X = df.drop(
    columns=[
        "delay_category",
        "original_estimated_date",
        "updated_delivery_date",
        "final_receipt_date",
    ]
)
y = df["delay_category"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Training the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Making predictions
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy}")


# Function to prepare and align future order data with training data
def prepare_future_order_data(data, feature_columns):
    """Build a DataFrame whose columns match ``feature_columns`` exactly.

    Args:
        data: Mapping from feature name to a list of values for the order.
        feature_columns: Column names, in the order the model was fitted on.

    Returns:
        A DataFrame with one column per entry of ``feature_columns``, in that
        order. Features absent from ``data`` are filled with ``[0]`` (a single
        row), and keys of ``data`` that are not in ``feature_columns`` are
        ignored.
    """
    # Ensure all required features are present and in the correct order
    prepared_data = {col: data[col] if col in data else [0] for col in feature_columns}
    return pd.DataFrame(prepared_data)


# Example of predicting future orders
# Assuming we have data for a future order
future_order_data = {
    "order_size": [2],  # medium
    "season_autumn": [0],
    "season_spring": [1],
    "season_summer": [0],
    "season_winter": [0],
    "original_delay": [45],  # Assuming 45 days delay based on historical trends
    "updated_delay": [30],  # Assuming 30 days delay based on updated info
    "vendor_id_2": [1],  # Example vendor_id
}

# Prepare the future order data
future_order_df = prepare_future_order_data(future_order_data, X_train.columns)

# Predicting the delay category for the future order
future_prediction = model.predict(future_order_df)
print(f"Future Order Delay Prediction: {future_prediction}")
bastosmichael created this gist
Nov 20, 2023 . There are no files selected for viewing
# Notice shown by GitHub above this diff: "This file contains hidden or
# bidirectional Unicode text that may be interpreted or compiled differently
# than what appears below. To review, open the file in an editor that reveals
# hidden Unicode characters."
# Diff hunk header: @@ -0,0 +1,58 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from datetime import datetime, timedelta
import random


def generate_random_dates(start_date, end_date, num_dates):
    """Return ``num_dates`` datetimes sampled from a daily date range.

    Args:
        start_date: First day of the range, in any form ``pd.date_range``
            accepts.
        end_date: Last day of the range.
        num_dates: How many dates to draw.

    Returns:
        A list of ``datetime`` objects. ``random.choices`` samples with
        replacement, so the same day may appear more than once.
    """
    date_range = pd.date_range(start_date, end_date).to_pydatetime().tolist()
    return random.choices(date_range, k=num_dates)


def _categorize_delay(delay_days):
    """Map a delay in days to a class: 2 if > 90, 1 if > 60, otherwise 0."""
    # The larger threshold must be tested first; checking "> 60" first would
    # swallow every value above 90 and class 2 could never be produced.
    if delay_days > 90:
        return 2
    if delay_days > 60:
        return 1
    return 0


np.random.seed(0)
# generate_random_dates draws from the stdlib ``random`` module, which the
# NumPy seed does not affect; seed it too so the dates are reproducible.
random.seed(0)
num_orders = 100
start_date = '2020-01-01'
end_date = '2023-12-31'

# Made-up order history: every column is random and independent of the others.
data = {
    'vendor_id': np.random.randint(1, 10, size=num_orders),
    'order_size': np.random.choice(['small', 'medium', 'large'], size=num_orders),
    'season': np.random.choice(['winter', 'spring', 'summer', 'autumn'], size=num_orders),
    'original_estimated_date': generate_random_dates(start_date, end_date, num_orders),
    'updated_delivery_date': generate_random_dates(start_date, end_date, num_orders),
    'final_receipt_date': generate_random_dates(start_date, end_date, num_orders),
}

df = pd.DataFrame(data)

# Delays are whole days between the actual receipt and each estimate; they are
# negative when the order arrived before the estimate.
df['original_delay'] = (pd.to_datetime(df['final_receipt_date']) - pd.to_datetime(df['original_estimated_date'])).dt.days
df['updated_delay'] = (pd.to_datetime(df['final_receipt_date']) - pd.to_datetime(df['updated_delivery_date'])).dt.days
df['delay_category'] = df['original_delay'].apply(_categorize_delay)

# Encode order size ordinally and one-hot encode season and vendor.
df['order_size'] = df['order_size'].map({'small': 1, 'medium': 2, 'large': 3})
df = pd.get_dummies(df, columns=['season', 'vendor_id'])

# NOTE(review): original_delay stays in X although delay_category is computed
# directly from it, so the reported accuracy likely reflects the model
# relearning the thresholds rather than real predictive power.
X = df.drop(columns=['delay_category', 'original_estimated_date', 'updated_delivery_date', 'final_receipt_date'])
y = df['delay_category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Model Accuracy: {accuracy}')

future_order_data = {
    'order_size': [2],  # medium
    'season_autumn': [0],
    'season_spring': [1],
    'season_summer': [0],
    'season_winter': [0],
    'vendor_id_1': [0],
    'vendor_id_2': [1],
    'vendor_id_3': [0],
    # ... other vendor_ids
    'original_delay': [45],  # Assuming 45 days delay based on historical trends
    'updated_delay': [30],   # Assuming 30 days delay based on updated info
}

# The hand-written dict lists only some of the one-hot vendor columns and in a
# different order than the training frame. Align it to the fitted feature set
# (missing dummies become 0) so predict() receives the columns the model was
# trained on.
future_order_df = pd.DataFrame(future_order_data).reindex(
    columns=X_train.columns, fill_value=0
)
future_prediction = model.predict(future_order_df)
print(f'Future Order Delay Prediction: {future_prediction}')