#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import matplotlib.pyplot as plt

# Read the survey responses from the Excel workbook.
data = pd.read_excel("../检验推荐系统调查数据.xlsx")

# Mean of every numeric column.
average_values = data.mean(numeric_only=True)

# Score columns shown in the chart, in display order.
columns = ["推荐准确性", "推荐相关性", "操作便捷性", "生成报告时间", "数据隐私保护", "系统可接受性"]

# Mean score of each selected column.
values = average_values[columns]

# Select the SimHei font family by name (presumably so the Chinese labels render).
plt.rcParams['font.family'] = ['SimHei']

# Narrow bar chart: one bar of width 0.3 per score column.
fig, ax = plt.subplots(figsize=(3.5, 4))
ax.bar(values.index, values.values, color='skyblue', width=0.3)

# Tick labels: x rotated 45 degrees at size 10, y at size 12.
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=12)

# Axis labels at size 10, title at size 12.
ax.set_xlabel('指标', fontsize=10)
ax.set_ylabel('平均分', fontsize=10)
ax.set_title('推荐系统各项指标的平均分', fontsize=12)

# Fix the score axis to the range 0-10.
ax.set_ylim(0, 10)

# Write each mean, rounded to two decimals, just above its bar (size 10).
for bar_position, mean_score in enumerate(values.values):
    ax.text(bar_position, mean_score + 0.1, round(mean_score, 2), ha='center', fontsize=10)

# Save the chart as PNG and then TIFF, cropped tightly around the artwork.
for image_format in ('png', 'tiff'):
    plt.savefig(f'recommendation_system_average_scores.{image_format}',
                format=image_format, bbox_inches='tight')

# Tighten the layout for the on-screen rendering and display the figure.
plt.tight_layout()
plt.show()


# In[3]:


import seaborn as sns

# Pairwise correlation between the score columns (DataFrame.corr with its
# default settings).
correlation_matrix = data[columns].corr()

# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus
# sign (U+2212). Correlations can be negative, and CJK fonts such as SimHei
# commonly lack that glyph, which would show up as an empty box on the
# colorbar ticks.
plt.rcParams['axes.unicode_minus'] = False

# Set up the matplotlib figure
plt.figure(figsize=(10, 8))

# Annotated heatmap of the correlation matrix: two-decimal cell labels at
# size 22 and a labelled colorbar.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 22}, cbar=True, cbar_kws={'label': 'Correlation Coefficient'})

# Title and tick labels
plt.title('推荐系统各项指标的相关性分析', fontsize=20)
plt.xticks(rotation=45, ha="right", fontsize=16)
plt.yticks(rotation=0, ha="right", fontsize=16)

# Save the plot as a PNG file
plt.savefig('Heat map correlation analysis.png', format='png', bbox_inches='tight')

# Save the plot as a TIFF file
plt.savefig('Heat map correlation analysis.tiff', format='tiff', bbox_inches='tight')

# Show the plot
plt.tight_layout()
plt.show()


# In[ ]:


# In[7]:


import pandas as pd
from scipy.stats import pearsonr

# 1. Load the survey data.
file_path = '../检验推荐系统调查数据.xlsx'  # replace with the path to your Excel file
data = pd.read_excel(file_path)

# 2. Columns that take part in the correlation analysis.
columns = ['推荐准确性', '推荐相关性', '操作便捷性', '生成报告时间', '数据隐私保护', '系统可接受性']

# 3. Count missing values per column; if any column has gaps, fill them with
#    that column's mean.
missing_values = data[columns].isnull().sum()
if missing_values.any():
    column_means = data[columns].mean()
    data[columns] = data[columns].fillna(column_means)

# 4. Pearson p-value for every unordered pair of columns. Each column is only
#    paired with the columns that come after it, so no pair is computed twice.
p_values = {}
for position, first_col in enumerate(columns):
    for second_col in columns[position + 1:]:
        _coefficient, pair_p_value = pearsonr(data[first_col], data[second_col])
        p_values[(first_col, second_col)] = pair_p_value

# 5. Print the p-values, formatted to two decimal places.
for column_pair, p_value in p_values.items():
    col1, col2 = column_pair
    print(f"p值 between {col1} and {col2}: {p_value:.2f}")


# In[6]:


import pandas as pd
from scipy.stats import kruskal
from scipy.stats import pearsonr

# Path of the survey workbook.
file_path = '../检验推荐系统调查数据.xlsx'

# Load the data.
data = pd.read_excel(file_path)

# Rating columns analysed in this cell.
quantitative_columns = [
    '推荐准确性',
    '推荐相关性',
    '操作便捷性',
    '生成报告时间',
    '数据隐私保护',
    '系统可接受性',
]

# Work on just the rating columns for the summaries below.
rating_scores = data[quantitative_columns]

# Descriptive statistics (DataFrame.describe) of the rating columns.
descriptive_stats = rating_scores.describe()
print("描述性统计信息:\n", descriptive_stats)

# Correlation matrix of the rating columns.
correlation_matrix = rating_scores.corr()
print("\n相关性矩阵:\n", correlation_matrix)

# Overall satisfaction = row-wise mean of the rating columns.
data['overall_satisfaction'] = rating_scores.mean(axis=1)

# Detect outliers and remove them.
def find_and_remove_outliers(data, columns):
    """Drop every row that is an IQR outlier in at least one of ``columns``.

    A value is an outlier for its column when it lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``, where Q1/Q3 are the
    column's 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    columns : iterable of column labels
        Columns to test. With no columns, ``data`` is returned unfiltered.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that are not outliers in any tested column,
        with their original index labels.
    """
    # Row-wise flag: True once the row is an outlier in any tested column.
    # Rows are tracked by position in the index rather than by outlier value:
    # de-duplicating the outlier *values* would keep only the first of several
    # rows sharing the same extreme score and leave the others in the result.
    is_outlier_row = pd.Series(False, index=data.index)
    for col in columns:
        q25 = data[col].quantile(0.25)
        q75 = data[col].quantile(0.75)
        iqr = q75 - q25
        lower_bound = q25 - 1.5 * iqr
        upper_bound = q75 + 1.5 * iqr
        # Comparisons against NaN are False, so missing values are never flagged.
        is_outlier_row |= (data[col] < lower_bound) | (data[col] > upper_bound)
    return data.loc[~is_outlier_row]

# Remove rows flagged as outliers in any rating column.
data_cleaned = find_and_remove_outliers(data, quantitative_columns)

# Kruskal-Wallis H test across the rating columns, one sample per column in
# the order given by quantitative_columns.
rating_samples = [data_cleaned[rating_col] for rating_col in quantitative_columns]
kruskal_results_cleaned = kruskal(*rating_samples)

print("\nKruskal-Wallis H检验结果:\n", kruskal_results_cleaned)

# Pearson correlation coefficient between overall satisfaction and each
# rating column; only the coefficient is kept, the p-value is discarded.
overall_scores = data_cleaned['overall_satisfaction']
pearson_results = {}
for col in quantitative_columns:
    coefficient, _p_value = pearsonr(overall_scores, data_cleaned[col])
    pearson_results[col] = coefficient

print("\nPearson相关系数检验结果:\n", pearson_results)


# In[3]:


# Analyse the "系统改进建议" column to find the most common suggestions.

# Strip full stops (。 and ．) from the free-text answers BEFORE counting, so
# that answers differing only by that punctuation are tallied together and the
# result does not depend on whether this cell has been run before.
data["系统改进建议"] = data["系统改进建议"].str.replace(r'[。．]', '', regex=True)
suggestions = data["系统改进建议"].value_counts()

# Frequency of each answer in the "系统对工作流程的影响" column.
workflow_impact = data["系统对工作流程的影响"].value_counts()

# Frequency of each answer in the "系统对医疗质量的影响" column.
quality_impact = data["系统对医疗质量的影响"].value_counts()

# Horizontal bar chart of the ten most frequent suggestions.
top_suggestions = suggestions.head(10)

plt.figure(figsize=(6, 6))
top_suggestions.plot(kind='barh', color='skyblue')

# Tick labels at font size 16 on both axes.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.xlabel('出现次数', fontsize=16)
plt.ylabel('系统改进建议', fontsize=16)
plt.title('最常见的10条系统改进建议', fontsize=16)
plt.gca().invert_yaxis()  # Invert the y-axis to display the most frequent suggestions at the top

# Save the plot as a PNG file
plt.savefig('最常见的10条系统改进建议.png', format='png', bbox_inches='tight')

# Save the plot as a TIFF file
plt.savefig('最常见的10条系统改进建议.tiff', format='tiff', bbox_inches='tight')
plt.show()


# In[10]:


# Analyse the "系统对工作流程的影响" column to find the most common answers.

# Strip full stops (。 and ．) from the free-text answers BEFORE counting, so
# that answers differing only by that punctuation are tallied together and the
# result does not depend on whether this cell has been run before.
data["系统对工作流程的影响"] = data["系统对工作流程的影响"].str.replace(r'[。．]', '', regex=True)
suggestions = data["系统对工作流程的影响"].value_counts()

# Horizontal bar chart of the ten most frequent answers.
top_suggestions = suggestions.head(10)

plt.figure(figsize=(6, 6))
top_suggestions.plot(kind='barh', color='lightcoral')

# Tick labels at font size 16 on both axes.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.xlabel('出现次数', fontsize=16)
plt.ylabel('系统对工作流程的影响', fontsize=16)
plt.title('最常见的10种系统对工作流程的影响', fontsize=16)
plt.gca().invert_yaxis()  # Invert the y-axis to display the most frequent answers at the top

# Save the plot as a PNG file
plt.savefig('系统对工作流程的影响.png', format='png', bbox_inches='tight')

# Save the plot as a TIFF file
plt.savefig('系统对工作流程的影响.tiff', format='tiff', bbox_inches='tight')
plt.show()


# In[4]:


# Analyse the "系统对医疗质量的影响" column to find the most common answers.

# Strip full stops (。 and ．) from the free-text answers BEFORE counting, so
# that answers differing only by that punctuation are tallied together and the
# result does not depend on whether this cell has been run before.
data["系统对医疗质量的影响"] = data["系统对医疗质量的影响"].str.replace(r'[。．]', '', regex=True)
suggestions = data["系统对医疗质量的影响"].value_counts()

# Horizontal bar chart of the ten most frequent answers.
top_suggestions = suggestions.head(10)

plt.figure(figsize=(6, 6))
top_suggestions.plot(kind='barh', color='lightgreen')

# Tick labels at font size 16 on both axes.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.xlabel('出现次数', fontsize=16)
plt.ylabel('系统对医疗质量的影响', fontsize=16)
plt.title('最常见的10种系统对医疗质量的影响', fontsize=16)
plt.gca().invert_yaxis()  # Invert the y-axis to display the most frequent answers at the top

# Save under this chart's own name. Reusing the workflow-impact file name here
# would overwrite the figure written by the workflow-impact cell.
# Save the plot as a PNG file
plt.savefig('系统对医疗质量的影响.png', format='png', bbox_inches='tight')

# Save the plot as a TIFF file
plt.savefig('系统对医疗质量的影响.tiff', format='tiff', bbox_inches='tight')
plt.show()


# In[10]:


import matplotlib.pyplot as plt
import seaborn as sns

# Use SimHei as the sans-serif font so Chinese text can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw minus signs with an ASCII hyphen; CJK fonts such as SimHei commonly
# lack the Unicode minus glyph (U+2212) that matplotlib uses by default.
plt.rcParams['axes.unicode_minus'] = False
# Base font size for all figure text.
plt.rcParams['font.size'] = 16

# (score column, y-axis label) for each boxplot, in plotting order. The title
# and the output file names are derived from the column name.
score_boxplots = [
    ('推荐准确性', '推荐准确性'),
    ('推荐相关性', '推荐相关性'),
    ('操作便捷性', '操作便捷性'),
    ('生成报告时间', '生成报告时间的总时间'),
    ('数据隐私保护', '数据隐私保护'),
]

for score_column, y_label in score_boxplots:
    # One fresh figure per score so every boxplot gets the same (6, 4) size and
    # never draws on top of a previous one, regardless of whether the active
    # backend closes figures on plt.show().
    plt.figure(figsize=(6, 4))

    # Horizontal boxplot of the score distribution.
    sns.boxplot(x=data[score_column], width=0.2)
    plt.xticks(fontsize=16)
    plt.title(f'{score_column}的分布')
    plt.ylabel(y_label)

    # Save the plot as PNG and as TIFF.
    for image_format in ('png', 'tiff'):
        plt.savefig(f'{score_column}的分布.{image_format}',
                    format=image_format, bbox_inches='tight')

    plt.show()
    # Release the figure if the backend left it open after showing it.
    plt.close()


# In[11]:


import matplotlib.pyplot as plt
import seaborn as sns

# Text settings: SimHei as the sans-serif font (chosen for its Chinese glyph
# support) and a base font size of 16.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'font.size': 16,
})

# Create the 6x4 figure together with its single axes.
fig, ax = plt.subplots(figsize=(6, 4))

# Horizontal boxplot of the "系统可接受性" scores.
sns.boxplot(x=data['系统可接受性'], width=0.2, ax=ax)
plt.xticks(fontsize=16)
ax.set_title('系统可接受性的分布')
ax.set_ylabel('系统可接受性')

# Save the plot as a PNG file, then as a TIFF file.
for image_format in ('png', 'tiff'):
    plt.savefig(f'系统可接受性的分布.{image_format}',
                format=image_format, bbox_inches='tight')
plt.show()


# In[12]:


import matplotlib.pyplot as plt
import seaborn as sns

# Text settings: SimHei as the sans-serif font (chosen for its Chinese glyph
# support) and a base font size of 16.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'font.size': 16,
})

# Create the 6x4 figure and keep a handle on its axes.
fig, ax = plt.subplots(figsize=(6, 4))

# Thin out the frame: 0.1 for top/bottom/right, 0.2 for the left spine.
for side, thickness in (('top', 0.1), ('bottom', 0.1), ('left', 0.2), ('right', 0.1)):
    ax.spines[side].set_linewidth(thickness)

# Horizontal boxplot of the "系统可接受性" column of the `data` DataFrame.
sns.boxplot(x=data['系统可接受性'], width=0.2, ax=ax)
plt.xticks(fontsize=16)
ax.set_title('系统可接受性的分布')
ax.set_ylabel('系统可接受性')

# Thin gray vertical guide lines at fixed x positions (adjust as needed).
for guide_x in (0.5, 2, 3.5, 5.5, 8, 10):
    ax.axvline(x=guide_x, color='gray', linestyle='-', linewidth=0.3)

# Save the plot as a PNG file, then as a TIFF file.
for image_format in ('png', 'tiff'):
    plt.savefig(f'系统可接受性的分布.{image_format}',
                format=image_format, bbox_inches='tight')

plt.show()


# In[ ]: