From db85c41fa0fe0966d28b77255de1fa1f8b81d35e Mon Sep 17 00:00:00 2001
From: Luis Aleixo <luis.aleixo@cern.ch>
Date: Wed, 9 Nov 2022 11:55:47 +0100
Subject: [PATCH] added VE script and excel file

---
 .gitattributes                                |  1 +
 ..._COVID19_VE_Studies_08Sep2022_adapted.xlsx |  3 +
 caimira/scripts/data/vaccine_effectiveness.py | 56 +++++++++++++++++++
 3 files changed, 60 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 caimira/scripts/data/WeeklySummary_COVID19_VE_Studies_08Sep2022_adapted.xlsx
 create mode 100644 caimira/scripts/data/vaccine_effectiveness.py

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..ca21a586
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.xlsx filter=lfs diff=lfs merge=lfs -text
diff --git a/caimira/scripts/data/WeeklySummary_COVID19_VE_Studies_08Sep2022_adapted.xlsx b/caimira/scripts/data/WeeklySummary_COVID19_VE_Studies_08Sep2022_adapted.xlsx
new file mode 100644
index 00000000..c3ff459d
--- /dev/null
+++ b/caimira/scripts/data/WeeklySummary_COVID19_VE_Studies_08Sep2022_adapted.xlsx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2aef0605c3668b4884b815f71c0eae93eaaa7b88d6b6c17ff97f6a86a38674a5
+size 76179581
diff --git a/caimira/scripts/data/vaccine_effectiveness.py b/caimira/scripts/data/vaccine_effectiveness.py
new file mode 100644
index 00000000..8186da64
--- /dev/null
+++ b/caimira/scripts/data/vaccine_effectiveness.py
@@ -0,0 +1,56 @@
+import pandas as pd
+from tabulate import tabulate
+
+'''
+Script file to generate the vaccine effectiveness values.
+To generate the primary vaccine effectiveness values, uncomment lines 16-21.
+To generate the booster effectiveness values, uncomment lines 26-56.
+'''
+
+# Data from 08 Sep. 2022
+file_loc = "./WeeklySummary_COVID19_VE_Studies_08Sep2022_adapted.xlsx"
+
+
+# ------- PRIMARY VACCINATION ------ #
+
+# df = pd.read_excel(file_loc, sheet_name="Primary_filtered", usecols="A, B, E")
+
+# calculate the VE value: drop rows with negative VE, then take the mean per vaccine as a fraction (VE / 100)
+# df = df.drop(df[df['VE'] < 0].index)
+# ve_data = df.groupby(['vaccine'])['VE'].mean().divide(100).reset_index()
+# print(tabulate(ve_data, headers='keys', tablefmt='psql'))
+
+
+# ------- BOOSTER VACCINATION ------ #
+
+# df = pd.read_excel(file_loc, sheet_name="Booster_filtered", usecols="A, B, C, F")
+
+# # create a df without the rows whose primary series vaccine contains the ' or ' substring
+# rows_with_or = df[df['primary series vaccine'].str.contains(' or ')]
+# rows_indexes = list(rows_with_or.index)
+# df_without_or = df.drop(labels=rows_indexes, axis=0)
+
+# # copy of all the rows that contain ' or ', re-indexed from 0
+# new_rows_with_or = rows_with_or.reset_index().copy()
+
+# # create a new, empty dataframe with the same columns
+# rows_to_add = pd.DataFrame(columns=rows_with_or.columns)
+
+# # split each row in two (one per primary vaccine on either side of ' or ') and add both to the new dataframe
+# for index, row in new_rows_with_or.iterrows():
+#     new_rows_with_or.at[index, 'primary series vaccine'] = row['primary series vaccine'].split(' or ')[0]
+#     rows_to_add.loc[index] = new_rows_with_or.loc[index]
+#     new_rows_with_or.at[index, 'primary series vaccine'] = row['primary series vaccine'].split(' or ')[1]
+#     rows_to_add.loc[len(rows_indexes)+index] = new_rows_with_or.loc[index]
+
+# # merge the dataframe without the ' or ' rows with the new dataframe that has those rows split in two
+# final_df = pd.concat([df_without_or, rows_to_add]).reset_index().drop(columns=['index'])
+
+# # calculate the VE value: drop rows with negative VE, then take the mean per (primary, booster) pair as a fraction
+# final_df = final_df.drop(final_df[final_df['VE'] < 0].index)
+
+# ve_data = final_df.groupby(['primary series vaccine', 'booster vaccine'])['VE'].mean().divide(100).reset_index()
+
+# result = ve_data.to_dict('records')
+
+# print(tabulate(ve_data, headers='keys', tablefmt='psql'))