Revision | d7d1d1aea8bf51989975e4c3205c635f9a49c881 (tree) |
---|---|
Time | 2022-02-16 06:48:30 |
Author | Lorenzo Isella <lorenzo.isella@gmai...> |
Commiter | Lorenzo Isella |
A real code to generate a flipbook of one of my scripts.
@@ -0,0 +1,200 @@ | ||
1 | +--- | |
2 | +title: "TAM Data Preparation" | |
3 | +subtitle: "A Step-by-step Guide" | |
4 | +author: "Lorenzo Isella" | |
5 | +output: | |
6 | + xaringan::moon_reader: | |
7 | + lib_dir: libs | |
8 | + css: [default, hygge, ninjutsu] | |
9 | + nature: | |
10 | + ratio: 16:10 | |
11 | + highlightStyle: github | |
12 | + highlightLines: true | |
13 | + countIncrementalSlides: false | |
14 | +--- | |
15 | + | |
16 | +```{r setup, include = FALSE} | |
17 | +library(flipbookr) | |
18 | +library(tidyverse) | |
19 | +knitr::opts_chunk$set(fig.width = 6, message = FALSE, | |
20 | + warning = FALSE, comment = "", | |
21 | + cache = F) | |
22 | +``` | |
23 | + | |
24 | +```{css, eval = TRUE, echo = FALSE} | |
25 | +.remark-code{line-height: 1.5; font-size: 80%} | |
26 | +@media print { | |
27 | + .has-continuation { | |
28 | + display: block; | |
29 | + } | |
30 | +} | |
31 | +``` | |
32 | + | |
33 | + | |
34 | + | |
35 | +```{r covid, include = FALSE} | |
36 | +library(tidyverse) | |
37 | +library(janitor) | |
38 | +library(openxlsx) | |
39 | +library(stringr) | |
40 | +library(lubridate) | |
41 | + | |
42 | + | |
43 | +source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R") | |
44 | + | |
45 | +covid_data_ini <- read_excel("SA-Covid19.xlsx") #BREAK | |
46 | + | |
47 | + | |
48 | +covid_data <- covid_data_ini %>% | |
49 | + clean_data() %>% | |
50 | + select(case_reference) %>% | |
51 | + distinct %>% | |
52 | + mutate(is_covid_case="Yes") | |
53 | + | |
54 | +covid_data | |
55 | + | |
56 | +``` | |
57 | + | |
58 | +```{r proc, include = FALSE} | |
59 | +codes <- read_csv("procedure_codes.csv") | |
60 | +codes | |
61 | +``` | |
62 | +--- | |
63 | +First we load some libraries and the file containing the covid identifier. | |
64 | +--- | |
65 | +`r chunk_reveal(chunk_name = "covid" , break_type = "user" , | |
66 | + title = "### Set up the libraries and read the covid identifier file")` | |
67 | + | |
68 | + | |
69 | +--- | |
70 | +Then we read the file which contains the list of the procedures | |
71 | +--- | |
72 | +`r chunk_reveal(chunk_name = "proc" , | |
73 | + title = "### Read the file with the procedure list")` | |
74 | + | |
75 | +```{r tam_raw, include = FALSE} | |
76 | +df_ini <- read_all_csv_to_char(pattern="(tam).*\\.csv$") | |
77 | +df_ini | |
78 | +``` | |
79 | +--- | |
80 | +Then we read the TAM data as extracted by R3 (with a parser we have | |
81 | +already converted the Excel files to csv) | |
82 | +--- | |
83 | +`r chunk_reveal(chunk_name = "tam_raw" , | |
84 | + title = "### Read the TAM data extracted by Kurt and Praveen already converted to csv format")` | |
85 | + | |
86 | + | |
87 | +```{r tam_clean, include = FALSE} | |
88 | +df <- df_ini %>% clean_names() %>% | |
89 | + distinct(aid_award_reference, .keep_all=T) %>% | |
90 | + filter(beneficiary_country %!in% c("Spain", "Poland", "Romania")) %>% | |
91 | + clean_data() %>% | |
92 | + mutate(across(contains("aid_absolute_eur"), ~as.numeric(.x))) %>% | |
93 | + mutate(across(contains("date"), ~mdy(.x))) %>% | |
94 | + mutate(year=year(aid_award_granted_date)) %>% | |
95 | + mutate(lower_bound=str_extract_before(granted_range_eur,"-")) %>% | |
96 | + mutate(upper_bound=str_extract_after(granted_range_eur,"-")) %>% | |
97 | + mutate(lower_bound=as.numeric(lower_bound), | |
98 | + upper_bound=as.numeric(upper_bound)) %>% | |
99 | + mutate(estimated_value=(lower_bound+upper_bound)/2) %>% pattern_to_na(0) %>% | |
100 | + mutate(granted_value_extended_eur = case_when( | |
101 | + !is.na(granted_aid_absolute_eur) ~ granted_aid_absolute_eur, | |
102 | + is.na(granted_aid_absolute_eur) & !is.na(estimated_value) ~estimated_value, | |
103 | + is.na(granted_aid_absolute_eur) & is.na(estimated_value) ~ nominal_aid_absolute_eur)) %>% | |
104 | + mutate(nominal_value_extended_eur= | |
105 | + case_when(!is.na(nominal_aid_absolute_eur) ~ nominal_aid_absolute_eur, | |
106 | + is.na(nominal_aid_absolute_eur)~granted_value_extended_eur )) %>% | |
107 | + select(-c(lower_bound, upper_bound, estimated_value)) %>% | |
108 | + mutate(is_covid_case=if_else(case_reference %in% covid_data$case_reference, | |
109 | + "Yes", "No")) %>% | |
110 | + mutate(granted_value_extended_eur=if_else(is_covid_case=="Yes", | |
111 | + NA_real_,granted_value_extended_eur )) | |
112 | + | |
113 | +df %>% glimpse | |
114 | +``` | |
115 | +--- | |
116 | +The crucial part of the code: I try to infer the nominal amounts and | |
117 | +aid elements when they are not given explicitly in TAM. | |
118 | +If the aid element is not given directly, but a range is available, I | |
119 | +use the mid-point to estimate the aid element. If neither is | |
120 | +available, then I assume that the aid element is equal to the nominal | |
121 | +value. | |
122 | +Then I deal with the nominal value. | |
123 | +When the nominal amount is not | |
124 | +available, I replace it with the aid element I calculated before. | |
125 | + | |
126 | +Finally, I turn again to the aid element and, | |
127 | +for all the cases with the covid identifier equal to "yes", I set the | |
128 | +aid element as not available (since in that case the | |
129 | +amounts are always nominal and if they are given as aid elements, then | |
130 | +it is a mistake in the TAM encodying or in the info provided by the | |
131 | +MS). | |
132 | + | |
133 | +On top of that, I make sure that the beneficiary country is not one of | |
134 | +Spain, Poland or Romania (which do not upload their data in our | |
135 | +system). | |
136 | +I make sure the dates are treated correctly and I create a new | |
137 | +variable "year" which is the year when the aid was awarded. | |
138 | +--- | |
139 | +`r chunk_reveal(chunk_name = "tam_clean" , | |
140 | + title = "### Bulk of the code")` | |
141 | + | |
142 | + | |
143 | +```{r ttnew, include = FALSE} | |
144 | +write_csv(df, "tam_updated.csv.gz") | |
145 | + | |
146 | +``` | |
147 | +--- | |
148 | +I save the TAM database as a compressed csv file and read a file with | |
149 | +the NACE codes | |
150 | +--- | |
151 | +`r chunk_reveal(chunk_name = "ttnew" , | |
152 | + title = "### Save the TAM database as a compressed csv")` | |
153 | +--- | |
154 | +I also read the NACE codes and descriptions from a file. | |
155 | +--- | |
156 | +```{r nace2, include = FALSE} | |
157 | +nace <- read_excel("nace_codes.xlsx") %>% | |
158 | + mutate(code=substrLeft(description,1)) | |
159 | +nace | |
160 | +``` | |
161 | +`r chunk_reveal(chunk_name = "nace2" , | |
162 | + title = "### Get the NACE codes and descriptions")` | |
163 | +--- | |
164 | +I perform some minor modifications on the tam database and I extract a | |
165 | +single-letter NACE code, which I call macro sector. I then join the | |
166 | +TAM database with the file containing the description of the single | |
167 | +letter NACE code. | |
168 | + | |
169 | +Finally, I save the data in the R friendly RDS format | |
170 | +--- | |
171 | +```{r conclusion, include = FALSE} | |
172 | +df.out <- df %>% | |
173 | + mutate(name_of_beneficiary=if_else(!is.na(beneficiary_name_english), | |
174 | + beneficiary_name_english, beneficiary_name)) %>% | |
175 | + mutate(aid_award_ga=if_else(!is.na(aid_award_ga_english), | |
176 | + aid_award_ga_english, aid_award_ga_original)) %>% | |
177 | + rename("instrument_type"="aid_award_instrument") %>% | |
178 | + mutate(macro_sector=substrLeft(beneficiary_sector, 1)) %>% | |
179 | + left_join(y=nace, by=c("macro_sector"="code")) %>% | |
180 | + select(-macro_sector) %>% | |
181 | + rename("macro_sector"="description") %>% | |
182 | + rename("case_title"="case_title_original") %>% | |
183 | + left_join(y=codes, by=c("main_procedure_type_code"="code")) %>% | |
184 | + mutate(main_procedure_type_code=meaning) %>% | |
185 | + select(-meaning) | |
186 | + | |
187 | +df.out %>% glimpse | |
188 | + | |
189 | +saveRDS(df.out,"TAM_cleaned_for_shiny.RDS") | |
190 | + | |
191 | +``` | |
192 | +`r chunk_reveal(chunk_name = "conclusion" , | |
193 | + title = "### Last Steps and I save the TAM database for the Shiny App")` | |
194 | + | |
195 | + | |
196 | + | |
197 | + | |
198 | + | |
199 | + | |
200 | + |