Skip to content

Commit dcac253

Browse files
committed
experimenting
1 parent 4ff67ff commit dcac253

4 files changed

Lines changed: 54 additions & 3 deletions

File tree

analysis/example.do

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// Stata cannot import gzip-compressed csv files directly, so decompress to a plain csv file first
2+
!gunzip output/input.csv.gz
3+
4+
// now import the uncompressed csv with `import delimited`
5+
import delimited using output/input.csv
6+
7+
8+
// your analysis code goes here
9+
10+
11+
// all dta file outputs should be saved using `gzsave` and a .dta.gz extension
12+
// In subsequent actions, use `gzuse` to load them.
13+
gzsave output/stata.dta.gz

analysis/example.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import pandas as pd
2+
import pyarrow.feather
3+
4+
df = pd.read_csv("output/input.csv.gz")
5+
6+
7+
# pandas writes compressed feather files by default (pyarrow's default codec is lz4 when available)
8+
df.to_feather("output/python.feather.lz4")
9+
pyarrow.feather.write_feather(df, "output/python.feather.raw", compression="uncompressed")
10+
pyarrow.feather.write_feather(df, "output/python.feather.zstd", compression="zstd")

analysis/example.r

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# read the gzip-compressed csv file (readr decompresses .gz files automatically)
2+
df <- readr::read_csv("output/input.csv.gz")
3+
4+
# write .feather outputs: default compression, uncompressed, and zstd
5+
arrow::write_feather(df, "output/r.feather.lz4")
6+
arrow::write_feather(df, "output/r.feather.raw", compression = "uncompressed")
7+
arrow::write_feather(df, "output/r.feather.zstd", compression = "zstd")

project.yaml

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,33 @@
11
version: '3.0'
22

33
expectations:
4-
population_size: 1000
4+
population_size: 1000000
55

66
actions:
77

88
generate_study_population:
9-
run: cohortextractor:latest generate_cohort --study-definition study_definition
9+
run: cohortextractor:latest generate_cohort --output-format csv.gz --study-definition study_definition
1010
outputs:
1111
highly_sensitive:
12-
cohort: output/input.csv
12+
cohort: output/input.csv.gz
13+
14+
python_example:
15+
run: python:latest analysis/example.py
16+
needs: [generate_study_population]
17+
outputs:
18+
highly_sensitive:
19+
cohort: output/python.feather*
20+
21+
stata_example:
22+
run: stata-mp:latest analysis/example.do
23+
needs: [generate_study_population]
24+
outputs:
25+
highly_sensitive:
26+
cohort: output/stata.dta.gz
27+
28+
r_example:
29+
run: r:latest analysis/example.r
30+
needs: [generate_study_population]
31+
outputs:
32+
highly_sensitive:
33+
cohort: output/r.feather*

0 commit comments

Comments
 (0)