Revisión | 39ce1836a9db8561243039b6171b232c02071bb6 (tree) |
---|---|
Tiempo | 2024-09-18 02:45:30 |
Autor | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
I improved the script and now handle file overwriting better.
@@ -20,7 +20,7 @@ | ||
20 | 20 | if (start_from_tsv==1) { |
21 | 21 | |
22 | 22 | |
23 | -data <- open_dataset("export.tsv", | |
23 | +data <- open_dataset("../input/csv_files/export.tsv", | |
24 | 24 | format = "tsv", |
25 | 25 | skip_rows = 1, |
26 | 26 | newlines_in_values=T, |
@@ -61,12 +61,12 @@ | ||
61 | 61 | print("OK HERE") |
62 | 62 | |
63 | 63 | |
64 | - | |
64 | +remove_files_with_pattern("../input/parquet_data_input/*parquet") | |
65 | 65 | |
66 | 66 | write_dataset( |
67 | 67 | data, |
68 | 68 | format = "parquet", |
69 | - path = "./parquet_data_input/", | |
69 | + path = "../input/parquet_data_input/", | |
70 | 70 | basename_template="tam_raw-{i}.parquet" , |
71 | 71 | max_rows_per_file = 5e5 |
72 | 72 | ) |
@@ -80,13 +80,13 @@ | ||
80 | 80 | print("OK HERE2") |
81 | 81 | |
82 | 82 | |
83 | -df_tam_ini <- open_dataset("./parquet_data_input/") | |
83 | +df_tam_ini <- open_dataset("../input/parquet_data_input/") | |
84 | 84 | |
85 | 85 | |
86 | 86 | |
87 | 87 | |
88 | 88 | |
89 | -covid <- open_dataset("SA-Covid19.csv", format="csv") |> | |
89 | +covid <- open_dataset("../input/csv_files/SA-Covid19.csv", format="csv") |> | |
90 | 90 | rename("case_reference"="Case Reference") |> |
91 | 91 | select(case_reference) |> |
92 | 92 | mutate(is_covid_case="Yes") |> |
@@ -223,10 +223,12 @@ | ||
223 | 223 | } |
224 | 224 | |
225 | 225 | |
226 | +remove_files_with_pattern("../input/parquet-files/tam-original/*parquet") | |
227 | + | |
226 | 228 | write_dataset( |
227 | 229 | df_new, |
228 | 230 | format = "parquet", |
229 | - path = "./data_output/", | |
231 | + path = "../input/parquet-files/tam-original", | |
230 | 232 | basename_template="tam_database_cleaned-{i}.parquet" , |
231 | 233 | max_rows_per_file = 5e5 |
232 | 234 | ) |
@@ -236,14 +238,14 @@ | ||
236 | 238 | |
237 | 239 | |
238 | 240 | |
239 | -cases_wrong <- df_new |> | |
240 | - filter(is.na(granted_aid_absolute_eur), | |
241 | - is.na(nominal_aid_absolute_eur), | |
242 | - granted_range_eur=="0 - " | |
243 | - ) |> | |
244 | - collect() | |
241 | +## cases_wrong <- df_new |> | |
242 | +## filter(is.na(granted_aid_absolute_eur), | |
243 | +## is.na(nominal_aid_absolute_eur), | |
244 | +## granted_range_eur=="0 - " | |
245 | +## ) |> | |
246 | +## collect() | |
245 | 247 | |
246 | -save_excel(cases_wrong, "tam_errors.xlsx") | |
248 | +## save_excel(cases_wrong, "tam_errors.xlsx") | |
247 | 249 | |
248 | 250 | |
249 | 251 |