• R/O
  • SSH

Commit

Tags
No Tags

Frequently used words (click to add to your profile)

javac++androidlinuxc#windowsobjective-ccocoa誰得qtpythonphprubygameguibathyscaphec計画中(planning stage)翻訳omegatframeworktwitterdomtestvb.netdirectxゲームエンジンbtronarduinopreviewer

Commit MetaInfo

Revisión 5f02e8dbea0d074338c3042c14ad48d3e5a4a71a (tree)
Tiempo 2024-03-15 21:34:46
Autor Lorenzo Isella <lorenzo.isella@gmai...>
Committer Lorenzo Isella

Log Message

I cleaned up the code and I also generate a parquet file.

Cambiar Resumen

Diferencia incremental

diff -r 93416964f8e9 -r 5f02e8dbea0d R-codes/process_spanish_new_data_sequentially.R
--- a/R-codes/process_spanish_new_data_sequentially.R Fri Mar 15 12:38:28 2024 +0100
+++ b/R-codes/process_spanish_new_data_sequentially.R Fri Mar 15 13:34:46 2024 +0100
@@ -8,21 +8,11 @@
88
99 source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
1010
11-## data <- open_dataset("./", format="csv")
1211
13-## write_dataset(
14-## data,
15-## format = "parquet",
16-## path = "~/initial",
17-## max_rows_per_file = 1e7
18-## )
12+read_seq <- 0:250
1913
2014
21-
22-read_seq <- 1:249
23-
24-
25-df_map <- read_csv("spanish-mapping.csv") |>
15+df_map <- read_csv("../spanish-mapping.csv") |>
2616 mutate(across(where(is.character),~remove_trailing_spaces(.x)))
2717
2818
@@ -46,12 +36,12 @@
4636
4737
4838
49-mm <- read_excel("../ES_matching.xlsx") %>%
50- clean_names() %>%
51- select(-translate) %>%
39+mm <- read_excel("../ES_matching.xlsx") |>
40+ clean_names() |>
41+ select(-translate) |>
5242 mutate(original=make_clean_names(original),
53- match=make_clean_names(match)) %>%
54- select(match, original) %>%
43+ match=make_clean_names(match)) |>
44+ select(match, original) |>
5545 pattern_to_pattern("na", "na_1")
5646
5747
@@ -111,13 +101,14 @@
111101
112102
113103
114-df_tot2 <- df_tot %>% rename(!!! mm2) %>%
115- select(-starts_with("na_")) %>%
116- mutate(beneficiary_country="Spain") %>%
117- mutate(across(contains("date"), ~as_date(.x))) %>%
118- mutate(year=ymd(aid_award_granted_date)) %>%
119- mutate(year=year(year)) %>%
120- mutate(nominal_aid_absolute_eur=str_replace_all(nominal_aid_absolute_eur,",",""),
104+ df_tot2 <- df_tot |>
105+ rename(!!! mm2) |>
106+ select(-starts_with("na_")) |>
107+ mutate(beneficiary_country="Spain") |>
108+ mutate(across(contains("date"), ~as_date(.x))) |>
109+ mutate(year=ymd(aid_award_granted_date)) |>
110+ mutate(year=year(year)) |>
111+ mutate(nominal_aid_absolute_eur=str_replace_all(nominal_aid_absolute_eur,",",""),
121112 granted_aid_absolute_eur=str_replace_all(granted_aid_absolute_eur,",","")) |>
122113 mutate(across(contains("aid_absolute"), ~as.numeric(.x))) |>
123114 mutate(across(contains("date"), ~ymd(.x)))|>
@@ -147,15 +138,48 @@
147138 write_csv(df_tot2, fname.out)
148139
149140 }
150-
151-## saveRDS(df_tot2, "tam_ES_matched.RDS")
152-
153-## write_csv(df_tot2, "spanish_tam_cleaned.csv.gz")
154141
155142
156-## save_excel(df_tot2, "spanish_tam.xlsx")
157143
158-## write_parquet(df_tot2, "spanish_tam.parquet")
144+system("csvstack processed_data*csv > final_spain.csv")
145+
146+data <- open_dataset("final_spain.csv",
147+ format = "csv",
148+ skip_rows = 1,
149+ schema = schema(
150+ aid_award_ga_original = string(),
151+ aid_award_reference = string(),
152+ aid_award_objective = string(),
153+ aid_award_instrument = string(),
154+ beneficiary_type = string(),
155+ aid_award_granted_date = string(),
156+ aid_award_published_date= string(),
157+ beneficiary_name= string(),
158+ nominal_aid_absolute_eur= double(),
159+ granted_aid_absolute_eur = double(),
160+ beneficiary_region = string(),
161+ beneficiary_sector = string(),
162+ case_reference = string(),
163+ beneficiary_country = string(),
164+ year= double() ,
165+ nominal_value_extended_eur = double(),
166+ granted_value_extended_eur = double(),
167+ is_covid_case = string(),
168+ national_identification= string()
169+ )
170+)
171+
172+## write_dataset(
173+## data,
174+## format = "parquet",
175+## path = ".",
176+## max_rows_per_file = 1e7
177+## )
178+
179+write_parquet(
180+ data, "final_spain.parquet")
181+
182+
159183
160184
161185