Revisión | 3387593ddd7bde7d7c045fd0eab12f8a90329889 (tree) |
---|---|
Tiempo | 2024-10-09 00:29:55 |
Autor | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
Code to generate some synthetic data.
@@ -0,0 +1,55 @@ | ||
# Generate a synthetic data set of Bernoulli outcomes for a set of fake names.
#
# Each name gets its own success probability; names are sampled with unequal
# weights, one 0/1 outcome is drawn per sampled row, and the result is written
# to "artificial_data.csv.gz" with columns name, success, real_success_rate.
#
# The workspace is deliberately not cleared here: every object used below is
# assigned by this script, so it does not depend on a clean session.

library(tidyverse)
library(janitor)
library(charlatan)

# Provides list_to_df(), used below to stack a list of tibbles.
# NOTE(review): list_to_df() is not visible in this file; it appears to
# row-bind the list in order and add a `source` column identifying the
# originating list element -- confirm against stat_lib.R.
source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")

set.seed(1234)

n_groups <- 30
n_obs <- 100e3

# Fake names that act as group labels.
nn <- ch_name(n_groups) |>
  sort()

# A repeated name would merge two groups and duplicate rows in the join below.
stopifnot("generated names must be unique" = anyDuplicated(nn) == 0L)

# Sample the names with weights growing linearly along `nn`, then split the
# draws into one tibble per name.
weights <- seq_len(n_groups) / sum(seq_len(n_groups))

ll <- tibble(x = sample(nn, n_obs, replace = TRUE, prob = weights)) |>
  group_by(x) |>
  group_split()

# True success probability of each name.
p <- runif(n_groups, 0, 1)

df_p <- tibble(real_success_rate = p, group = nn)

# Look the probability up by name rather than by position: group_split()
# orders character keys in the C locale, whereas sort() above follows the
# session locale, so the i-th group is not guaranteed to be nn[i].
p_by_name <- setNames(p, nn)

res <- map(
  ll,
  \(z) tibble(x = rbinom(nrow(z), 1, p_by_name[[z$x[1]]]))
)

df_ll <- ll |>
  list_to_df() |>
  rename(name = x)

df_res <- res |>
  list_to_df() |>
  rename(success = x)

# Sanity check (for interactive inspection): empirical rate per list element.
test <- df_res |>
  group_by(source) |>
  summarise(n = n(), mean_success = mean(success)) |>
  ungroup() |>
  arrange(mean_success)

# df_ll and df_res were built from the same list in the same order, so their
# rows are combined positionally before attaching the true rate of each name.
df_out <- tibble(name = df_ll$name, success = df_res$success) |>
  left_join(y = df_p, by = c("name" = "group"))

# Sanity check (for interactive inspection): empirical vs. true rate per name.
test2 <- df_out |>
  group_by(name) |>
  summarise(
    n = n(),
    mean_success = mean(success),
    real_rate = real_success_rate[1]
  ) |>
  ungroup() |>
  arrange(mean_success)

# The .gz extension makes readr compress the output.
write_csv(df_out, "artificial_data.csv.gz")

print("So far so good")