Revisión | 5f02e8dbea0d074338c3042c14ad48d3e5a4a71a (tree) |
---|---|
Tiempo | 2024-03-15 21:34:46 |
Autor | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
I cleaned up the code and I also generate a parquet file.
@@ -8,21 +8,11 @@ | ||
8 | 8 | |
9 | 9 | source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R") |
10 | 10 | |
11 | -## data <- open_dataset("./", format="csv") | |
12 | 11 | |
13 | -## write_dataset( | |
14 | -## data, | |
15 | -## format = "parquet", | |
16 | -## path = "~/initial", | |
17 | -## max_rows_per_file = 1e7 | |
18 | -## ) | |
12 | +read_seq <- 0:250 | |
19 | 13 | |
20 | 14 | |
21 | - | |
22 | -read_seq <- 1:249 | |
23 | - | |
24 | - | |
25 | -df_map <- read_csv("spanish-mapping.csv") |> | |
15 | +df_map <- read_csv("../spanish-mapping.csv") |> | |
26 | 16 | mutate(across(where(is.character),~remove_trailing_spaces(.x))) |
27 | 17 | |
28 | 18 |
@@ -46,12 +36,12 @@ | ||
46 | 36 | |
47 | 37 | |
48 | 38 | |
49 | -mm <- read_excel("../ES_matching.xlsx") %>% | |
50 | - clean_names() %>% | |
51 | - select(-translate) %>% | |
39 | +mm <- read_excel("../ES_matching.xlsx") |> | |
40 | + clean_names() |> | |
41 | + select(-translate) |> | |
52 | 42 | mutate(original=make_clean_names(original), |
53 | - match=make_clean_names(match)) %>% | |
54 | - select(match, original) %>% | |
43 | + match=make_clean_names(match)) |> | |
44 | + select(match, original) |> | |
55 | 45 | pattern_to_pattern("na", "na_1") |
56 | 46 | |
57 | 47 |
@@ -111,13 +101,14 @@ | ||
111 | 101 | |
112 | 102 | |
113 | 103 | |
114 | -df_tot2 <- df_tot %>% rename(!!! mm2) %>% | |
115 | - select(-starts_with("na_")) %>% | |
116 | - mutate(beneficiary_country="Spain") %>% | |
117 | - mutate(across(contains("date"), ~as_date(.x))) %>% | |
118 | - mutate(year=ymd(aid_award_granted_date)) %>% | |
119 | - mutate(year=year(year)) %>% | |
120 | - mutate(nominal_aid_absolute_eur=str_replace_all(nominal_aid_absolute_eur,",",""), | |
104 | + df_tot2 <- df_tot |> | |
105 | + rename(!!! mm2) |> | |
106 | + select(-starts_with("na_")) |> | |
107 | + mutate(beneficiary_country="Spain") |> | |
108 | + mutate(across(contains("date"), ~as_date(.x))) |> | |
109 | + mutate(year=ymd(aid_award_granted_date)) |> | |
110 | + mutate(year=year(year)) |> | |
111 | + mutate(nominal_aid_absolute_eur=str_replace_all(nominal_aid_absolute_eur,",",""), | |
121 | 112 | granted_aid_absolute_eur=str_replace_all(granted_aid_absolute_eur,",","")) |> |
122 | 113 | mutate(across(contains("aid_absolute"), ~as.numeric(.x))) |> |
123 | 114 | mutate(across(contains("date"), ~ymd(.x)))|> |
@@ -147,15 +138,48 @@ | ||
147 | 138 | write_csv(df_tot2, fname.out) |
148 | 139 | |
149 | 140 | } |
150 | - | |
151 | -## saveRDS(df_tot2, "tam_ES_matched.RDS") | |
152 | - | |
153 | -## write_csv(df_tot2, "spanish_tam_cleaned.csv.gz") | |
154 | 141 | |
155 | 142 | |
156 | -## save_excel(df_tot2, "spanish_tam.xlsx") | |
157 | 143 | |
158 | -## write_parquet(df_tot2, "spanish_tam.parquet") | |
144 | +system("csvstack processed_data*csv > final_spain.csv") | |
145 | + | |
146 | +data <- open_dataset("final_spain.csv", | |
147 | + format = "csv", | |
148 | + skip_rows = 1, | |
149 | + schema = schema( | |
150 | + aid_award_ga_original = string(), | |
151 | + aid_award_reference = string(), | |
152 | + aid_award_objective = string(), | |
153 | + aid_award_instrument = string(), | |
154 | + beneficiary_type = string(), | |
155 | + aid_award_granted_date = string(), | |
156 | + aid_award_published_date= string(), | |
157 | + beneficiary_name= string(), | |
158 | + nominal_aid_absolute_eur= double(), | |
159 | + granted_aid_absolute_eur = double(), | |
160 | + beneficiary_region = string(), | |
161 | + beneficiary_sector = string(), | |
162 | + case_reference = string(), | |
163 | + beneficiary_country = string(), | |
164 | + year= double() , | |
165 | + nominal_value_extended_eur = double(), | |
166 | + granted_value_extended_eur = double(), | |
167 | + is_covid_case = string(), | |
168 | + national_identification= string() | |
169 | + ) | |
170 | +) | |
171 | + | |
172 | +## write_dataset( | |
173 | +## data, | |
174 | +## format = "parquet", | |
175 | +## path = ".", | |
176 | +## max_rows_per_file = 1e7 | |
177 | +## ) | |
178 | + | |
179 | +write_parquet( | |
180 | + data, "final_spain.parquet") | |
181 | + | |
182 | + | |
159 | 183 | |
160 | 184 | |
161 | 185 |