Skip to content

Instantly share code, notes, and snippets.

@bastosmichael
Last active November 21, 2023 05:21
Show Gist options
  • Save bastosmichael/2042249dc593b8ebdbd5ea9aca0cf026 to your computer and use it in GitHub Desktop.
Save bastosmichael/2042249dc593b8ebdbd5ea9aca0cf026 to your computer and use it in GitHub Desktop.

Revisions

  1. bastosmichael revised this gist Nov 21, 2023. 1 changed file with 69 additions and 24 deletions.
    93 changes: 69 additions & 24 deletions Predicting Orders based on delivery dates
    Original file line number Diff line number Diff line change
    @@ -6,53 +6,98 @@ from sklearn.metrics import accuracy_score
    from datetime import datetime, timedelta
    import random


    # Draw random calendar days (with replacement) from a closed date interval
    def generate_random_dates(start_date, end_date, num_dates):
        """Return ``num_dates`` dates sampled with replacement from a range.

        The candidates are the entries of ``pd.date_range(start_date, end_date)``
        converted to ``datetime`` objects. Sampling goes through the stdlib
        ``random`` module, so repeatability depends on ``random.seed``.
        """
        every_day = pd.date_range(start=start_date, end=end_date)
        candidates = list(every_day.to_pydatetime())
        return random.choices(candidates, k=num_dates)


    # Generating made-up data
    # Seed both generators: the categorical columns are drawn with NumPy, while
    # generate_random_dates samples through the stdlib ``random`` module, which
    # np.random.seed does not affect.
    np.random.seed(0)
    random.seed(0)
    num_orders = 100
    start_date = "2020-01-01"
    end_date = "2023-12-31"

    # One entry per column; every column holds num_orders values.
    data = {
        "vendor_id": np.random.randint(1, 10, size=num_orders),  # ids 1..9
        "order_size": np.random.choice(["small", "medium", "large"], size=num_orders),
        "season": np.random.choice(
            ["winter", "spring", "summer", "autumn"], size=num_orders
        ),
        "original_estimated_date": generate_random_dates(
            start_date, end_date, num_orders
        ),
        "updated_delivery_date": generate_random_dates(
            start_date, end_date, num_orders
        ),
        "final_receipt_date": generate_random_dates(start_date, end_date, num_orders),
    }

    df = pd.DataFrame(data)

    # Preprocess the data
    # Delays are whole days between the actual receipt and each estimate;
    # a negative value means the order arrived before the estimate.
    df["original_delay"] = (
        pd.to_datetime(df["final_receipt_date"])
        - pd.to_datetime(df["original_estimated_date"])
    ).dt.days
    df["updated_delay"] = (
        pd.to_datetime(df["final_receipt_date"])
        - pd.to_datetime(df["updated_delivery_date"])
    ).dt.days
    # Bucket the delay: 2 = more than 90 days, 1 = more than 60 days, 0 = rest.
    # The larger threshold must be tested first; checking "> 60" first captures
    # every delay above 90 as well, so category 2 would never be assigned.
    df["delay_category"] = np.select(
        [df["original_delay"] > 90, df["original_delay"] > 60],
        [2, 1],
        default=0,
    )

    # Additional feature engineering
    # Ordinal-encode the size and one-hot encode the unordered categoricals.
    # Each step must run exactly once: mapping the already-numeric sizes again
    # yields NaN, and get_dummies removes the columns it encodes.
    df["order_size"] = df["order_size"].map({"small": 1, "medium": 2, "large": 3})
    df = pd.get_dummies(df, columns=["season", "vendor_id"])

    # Splitting the dataset
    # Features are everything except the label and the raw date columns.
    # NOTE(review): original_delay stays in X even though delay_category is
    # computed from it, so the label can be read straight off a feature.
    X = df.drop(
        columns=[
            "delay_category",
            "original_estimated_date",
            "updated_delivery_date",
            "final_receipt_date",
        ]
    )
    y = df["delay_category"]
    # Hold out 30% of the rows for evaluation; random_state pins the shuffle.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Training the model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Making predictions
    predictions = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, predictions)
    print(f"Model Accuracy: {accuracy}")


    # Function to prepare and align future order data with training data
    def prepare_future_order_data(data, feature_columns):
        """Build a frame with exactly ``feature_columns``, in that order.

        Args:
            data: Mapping (or DataFrame) from column name to that column's
                values for the orders to score.
            feature_columns: Column names the frame must contain, in order.

        Returns:
            A DataFrame whose columns are ``feature_columns``. Columns found in
            ``data`` keep their values; missing ones are filled with 0. Keys of
            ``data`` that are not in ``feature_columns`` are dropped.
        """
        supplied = [col for col in feature_columns if col in data]
        # Size the zero filler to the supplied columns so multi-row input works;
        # a fixed one-element filler only lines up with single-row data.
        row_count = max(
            (
                len(data[col])
                for col in supplied
                if hasattr(data[col], "__len__") and not isinstance(data[col], str)
            ),
            default=1,
        )
        filler = [0] * row_count
        prepared_data = {
            col: data[col] if col in data else filler for col in feature_columns
        }
        return pd.DataFrame(prepared_data)


    # Example of predicting future orders
    # Assuming we have data for a future order. Only the known features are
    # listed; prepare_future_order_data zero-fills every other training column
    # (e.g. the remaining vendor_id_* indicators).
    future_order_data = {
        "order_size": [2],  # medium
        "season_autumn": [0],
        "season_spring": [1],
        "season_summer": [0],
        "season_winter": [0],
        "original_delay": [45],  # Assuming 45 days delay based on historical trends
        "updated_delay": [30],  # Assuming 30 days delay based on updated info
        "vendor_id_2": [1],  # Example vendor_id
    }

    # Prepare the future order data so its columns match the training matrix
    future_order_df = prepare_future_order_data(future_order_data, X_train.columns)

    # Predicting the delay category for the future order
    future_prediction = model.predict(future_order_df)
    print(f"Future Order Delay Prediction: {future_prediction}")
  2. bastosmichael created this gist Nov 20, 2023.
    58 changes: 58 additions & 0 deletions Predicting Orders based on delivery dates
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,58 @@
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from datetime import datetime, timedelta
    import random

    def generate_random_dates(start_date, end_date, num_dates):
        """Sample ``num_dates`` dates, with replacement, from a date range.

        Candidates are the entries of ``pd.date_range(start_date, end_date)``
        as ``datetime`` objects. The draw uses the stdlib ``random`` module.
        """
        span = pd.date_range(start=start_date, end=end_date)
        population = list(span.to_pydatetime())
        picks = random.choices(population, k=num_dates)
        return picks

    # Seed both generators: the categorical columns are drawn with NumPy, while
    # generate_random_dates samples through the stdlib ``random`` module, which
    # np.random.seed does not affect.
    np.random.seed(0)
    random.seed(0)
    num_orders = 100
    start_date = '2020-01-01'
    end_date = '2023-12-31'

    # One entry per column; every column holds num_orders values.
    data = {
        'vendor_id': np.random.randint(1, 10, size=num_orders),  # ids 1..9
        'order_size': np.random.choice(['small', 'medium', 'large'], size=num_orders),
        'season': np.random.choice(['winter', 'spring', 'summer', 'autumn'], size=num_orders),
        'original_estimated_date': generate_random_dates(start_date, end_date, num_orders),
        'updated_delivery_date': generate_random_dates(start_date, end_date, num_orders),
        'final_receipt_date': generate_random_dates(start_date, end_date, num_orders),
    }

    df = pd.DataFrame(data)

    # Delays are whole days between the actual receipt and each estimate;
    # a negative value means the order arrived before the estimate.
    df['original_delay'] = (pd.to_datetime(df['final_receipt_date']) - pd.to_datetime(df['original_estimated_date'])).dt.days
    df['updated_delay'] = (pd.to_datetime(df['final_receipt_date']) - pd.to_datetime(df['updated_delivery_date'])).dt.days
    # Bucket the delay: 2 = more than 90 days, 1 = more than 60 days, 0 = rest.
    # The larger threshold must be tested first; checking "> 60" first captures
    # every delay above 90 as well, so category 2 would never be assigned.
    df['delay_category'] = np.select(
        [df['original_delay'] > 90, df['original_delay'] > 60],
        [2, 1],
        default=0,
    )

    # Ordinal-encode the size and one-hot encode the unordered categoricals.
    df['order_size'] = df['order_size'].map({'small': 1, 'medium': 2, 'large': 3})
    df = pd.get_dummies(df, columns=['season', 'vendor_id'])

    # Features are everything except the label and the raw date columns.
    # NOTE(review): original_delay stays in X even though delay_category is
    # computed from it, so the label can be read straight off a feature.
    X = df.drop(
        ['delay_category', 'original_estimated_date', 'updated_delivery_date', 'final_receipt_date'],
        axis=1,
    )
    y = df['delay_category']

    # Hold out 30% of the rows for evaluation; random_state pins the shuffle.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=42,
    )

    # fit() returns the estimator itself, so construction and training chain.
    model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print(f'Model Accuracy: {accuracy}')

    # Features known for the hypothetical future order. One-hot columns not
    # listed here (e.g. the remaining vendor_id_* indicators) are zero-filled
    # when the frame is aligned to the training columns below.
    future_order_data = {
        'order_size': [2],  # medium
        'season_autumn': [0], 'season_spring': [1], 'season_summer': [0], 'season_winter': [0],
        'vendor_id_1': [0], 'vendor_id_2': [1], 'vendor_id_3': [0],
        'original_delay': [45],  # Assuming 45 days delay based on historical trends
        'updated_delay': [30],  # Assuming 30 days delay based on updated info
    }
    # The fitted model only accepts the columns it was trained on, in the same
    # order; the hand-written dict covers just a subset, so align it explicitly.
    future_order_df = pd.DataFrame(future_order_data).reindex(
        columns=X_train.columns, fill_value=0
    )

    future_prediction = model.predict(future_order_df)
    print(f'Future Order Delay Prediction: {future_prediction}')