• R/O
  • SSH

Commit

Tags
No Tags

Frequently used words (click to add to your profile)

java, c++, android, linux, c#, windows, objective-c, cocoa, 誰得, qt, python, php, ruby, game, gui, bathyscaphe, c, 計画中 (planning stage), 翻訳, omegat, framework, twitter, dom, test, vb.net, directx, ゲームエンジン, btron, arduino, previewer

Commit MetaInfo

Revisión: 39ce1836a9db8561243039b6171b232c02071bb6 (tree)
Tiempo: 2024-09-18 02:45:30
Autor: Lorenzo Isella <lorenzo.isella@gmai...>
Committer: Lorenzo Isella

Log Message

I improved the script and it now handles file overwriting better.

Cambiar Resumen

Diferencia incremental

diff -r b1a2d9f03553 -r 39ce1836a9db R-codes/create_tam_parquet.R
--- a/R-codes/create_tam_parquet.R Tue Sep 17 14:40:35 2024 +0200
+++ b/R-codes/create_tam_parquet.R Tue Sep 17 19:45:30 2024 +0200
@@ -20,7 +20,7 @@
2020 if (start_from_tsv==1) {
2121
2222
23-data <- open_dataset("export.tsv",
23+data <- open_dataset("../input/csv_files/export.tsv",
2424 format = "tsv",
2525 skip_rows = 1,
2626 newlines_in_values=T,
@@ -61,12 +61,12 @@
6161 print("OK HERE")
6262
6363
64-
64+remove_files_with_pattern("../input/parquet_data_input/*parquet")
6565
6666 write_dataset(
6767 data,
6868 format = "parquet",
69- path = "./parquet_data_input/",
69+ path = "../input/parquet_data_input/",
7070 basename_template="tam_raw-{i}.parquet" ,
7171 max_rows_per_file = 5e5
7272 )
@@ -80,13 +80,13 @@
8080 print("OK HERE2")
8181
8282
83-df_tam_ini <- open_dataset("./parquet_data_input/")
83+df_tam_ini <- open_dataset("../input/parquet_data_input/")
8484
8585
8686
8787
8888
89-covid <- open_dataset("SA-Covid19.csv", format="csv") |>
89+covid <- open_dataset("../input/csv_files/SA-Covid19.csv", format="csv") |>
9090 rename("case_reference"="Case Reference") |>
9191 select(case_reference) |>
9292 mutate(is_covid_case="Yes") |>
@@ -223,10 +223,12 @@
223223 }
224224
225225
226+remove_files_with_pattern("../input/parquet-files/tam-original/*parquet")
227+
226228 write_dataset(
227229 df_new,
228230 format = "parquet",
229- path = "./data_output/",
231+ path = "../input/parquet-files/tam-original",
230232 basename_template="tam_database_cleaned-{i}.parquet" ,
231233 max_rows_per_file = 5e5
232234 )
@@ -236,14 +238,14 @@
236238
237239
238240
239-cases_wrong <- df_new |>
240- filter(is.na(granted_aid_absolute_eur),
241- is.na(nominal_aid_absolute_eur),
242- granted_range_eur=="0 - "
243- ) |>
244- collect()
241+## cases_wrong <- df_new |>
242+## filter(is.na(granted_aid_absolute_eur),
243+## is.na(nominal_aid_absolute_eur),
244+## granted_range_eur=="0 - "
245+## ) |>
246+## collect()
245247
246-save_excel(cases_wrong, "tam_errors.xlsx")
248+## save_excel(cases_wrong, "tam_errors.xlsx")
247249
248250
249251