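experimenting with compressed inputs and outputs (csv.gz, feather, dta.gz) in the example Stata, Python and R actions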

bloodearnest · bloodearnest · commit dcac25366226 · 2022-03-16T08:30:15.000Z
diff --git a/analysis/example.do b/analysis/example.do
@@ -0,0 +1,13 @@
+// Example action: Stata cannot read compressed csv files directly, so unzip to a plain csv file first.
+!gunzip output/input.csv.gz
+// NOTE(review): gunzip without -k normally removes the .gz original -- confirm that is acceptable here.
+// Now import the uncompressed csv using import delimited.
+import delimited using output/input.csv
+
+
+// your analysis code goes here
+
+
+// All dta file outputs should be saved using `gzsave` and a .dta.gz extension.
+// In subsequent actions, use `gzuse` to load them.
+gzsave output/stata.dta.gz
diff --git a/analysis/example.py b/analysis/example.py
@@ -0,0 +1,10 @@
+import pandas as pd
+import pyarrow.feather
+# Example action: load the gzipped cohort csv and write it out as feather in three compression variants.
+df = pd.read_csv("output/input.csv.gz")
+
+# Feather files are compressed by default in Python (lz4 expected, per the file name -- TODO confirm);
+# the two write_feather calls below set the compression explicitly ("uncompressed" and "zstd").
+df.to_feather("output/python.feather.lz4")
+pyarrow.feather.write_feather(df, "output/python.feather.raw", compression="uncompressed")
+pyarrow.feather.write_feather(df, "output/python.feather.zstd", compression="zstd")
diff --git a/analysis/example.r b/analysis/example.r
@@ -0,0 +1,7 @@
+# Example action: read the gzip-compressed cohort .csv file into a data frame
+df <- readr::read_csv("output/input.csv.gz")
+
+# Write three .feather outputs: default compression (lz4 expected, per the file name -- confirm), uncompressed, zstd
+arrow::write_feather(df, "output/r.feather.lz4")
+arrow::write_feather(df, "output/r.feather.raw", compression = "uncompressed")
+arrow::write_feather(df, "output/r.feather.zstd", compression = "zstd")
diff --git a/project.yaml b/project.yaml
@@ -1,12 +1,33 @@
 version: '3.0'
 
 expectations:
-  population_size: 1000
+  population_size: 1000000
 
 actions:
 
   generate_study_population:
-    run: cohortextractor:latest generate_cohort --study-definition study_definition
+    run: cohortextractor:latest generate_cohort --output-format csv.gz --study-definition study_definition
     outputs:
       highly_sensitive:
-        cohort: output/input.csv
+        cohort: output/input.csv.gz  # gzipped csv, matching --output-format csv.gz in the run command
+
+  python_example:
+    run: python:latest analysis/example.py
+    needs: [generate_study_population]
+    outputs:
+      highly_sensitive:
+        cohort: output/python.feather*  # glob: covers the .lz4, .raw and .zstd files written by analysis/example.py
+
+  stata_example:
+    run: stata-mp:latest analysis/example.do
+    needs: [generate_study_population]
+    outputs:
+      highly_sensitive:
+        cohort: output/stata.dta.gz  # written by gzsave in analysis/example.do
+
+  r_example:
+    run: r:latest analysis/example.r
+    needs: [generate_study_population]
+    outputs:
+      highly_sensitive:
+        cohort: output/r.feather*  # glob: covers the .lz4, .raw and .zstd files written by analysis/example.r