From d16c1d021b3a1de1df66ea2d9c7876f082925690 Mon Sep 17 00:00:00 2001 From: HKYoo95 Date: Sat, 18 Feb 2023 16:35:50 +0900 Subject: [PATCH 1/6] Update check_data.py --- 01-spark/check_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/01-spark/check_data.py b/01-spark/check_data.py index dfadc5f..7b0d198 100644 --- a/01-spark/check_data.py +++ b/01-spark/check_data.py @@ -1,4 +1,4 @@ import pandas as pd df=pd.read_csv('./data/fhvhv_tripdata_2020-03.csv') -print(df.head(5)) +print(df.head(5)) \ No newline at end of file From 4aa526fe4e800ed2f37365bf588b033076cab169 Mon Sep 17 00:00:00 2001 From: HKYoo95 Date: Sat, 18 Feb 2023 16:36:38 +0900 Subject: [PATCH 2/6] Update check_data.py --- 01-spark/check_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/01-spark/check_data.py b/01-spark/check_data.py index 7b0d198..d9bcd57 100644 --- a/01-spark/check_data.py +++ b/01-spark/check_data.py @@ -1,4 +1,5 @@ import pandas as pd df=pd.read_csv('./data/fhvhv_tripdata_2020-03.csv') -print(df.head(5)) \ No newline at end of file +print(df.head(5)) +print(df.head(10)) \ No newline at end of file From 36c5830481a6fdae48b870e109df23cbbcf48255 Mon Sep 17 00:00:00 2001 From: HKYoo95 Date: Sat, 18 Feb 2023 16:38:06 +0900 Subject: [PATCH 3/6] Update check_data.py --- 01-spark/check_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/01-spark/check_data.py b/01-spark/check_data.py index d9bcd57..7b0d198 100644 --- a/01-spark/check_data.py +++ b/01-spark/check_data.py @@ -1,5 +1,4 @@ import pandas as pd df=pd.read_csv('./data/fhvhv_tripdata_2020-03.csv') -print(df.head(5)) -print(df.head(10)) \ No newline at end of file +print(df.head(5)) \ No newline at end of file From e595b362703bf801a5cd9a795ebbb3cafe30bd63 Mon Sep 17 00:00:00 2001 From: HKYoo95 Date: Wed, 22 Feb 2023 16:13:44 +0900 Subject: [PATCH 4/6] study --- 01-spark/check_data.py | 3 ++- 01-spark/count_trips.py | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/01-spark/check_data.py b/01-spark/check_data.py index 7b0d198..3368532 100644 --- a/01-spark/check_data.py +++ b/01-spark/check_data.py @@ -1,4 +1,5 @@ import pandas as pd +import pyarrow.parquet as parquet -df=pd.read_csv('./data/fhvhv_tripdata_2020-03.csv') +df=pd.read_parquet('./data/fhvhv_tripdata_2020-03.parquet') print(df.head(5)) \ No newline at end of file diff --git a/01-spark/count_trips.py b/01-spark/count_trips.py index 77278ad..ebbb5eb 100644 --- a/01-spark/count_trips.py +++ b/01-spark/count_trips.py @@ -1,14 +1,15 @@ # 패키지를 가져오고 from pyspark import SparkConf, SparkContext +import pyarrow.parquet as parquet import pandas as pd # Spark 설정 -conf = SparkConf().setMaster("local").setAppName("uber-date-trips") +conf = SparkConf().setMaster("local[*]").setAppName("uber-date-trips") sc = SparkContext(conf=conf) # 우리가 가져올 데이터가 있는 파일 -directory = "/Users/keon/fastcampus/data-engineering/01-spark/data" -filename = "fhvhv_tripdata_2020-03.csv" +directory = "C:\projects\DataEngineering\data-engineering\01-spark\data" +filename = "fhvhv_tripdata_2020-03.parquet" # 데이터 파싱 lines = sc.textFile(f"file:///{directory}/{filename}") @@ -22,4 +23,4 @@ # 아래는 Spark코드가 아닌 일반적인 파이썬 코드 # CSV로 결과값 저장 -pd.Series(result, name="trips").to_csv("trips_date.csv") \ No newline at end of file +pd.Series(result, name="trips").to_csv("trips_date1.csv") \ No newline at end of file From e651d0d4782d465e94d2115cea9262af6cb135b4 Mon Sep 17 00:00:00 2001 From: HKYoo95 Date: Wed, 22 Feb 2023 16:21:22 +0900 Subject: [PATCH 5/6] Update count_trips.py --- 01-spark/count_trips.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/01-spark/count_trips.py b/01-spark/count_trips.py index ebbb5eb..8861b2d 100644 --- a/01-spark/count_trips.py +++ b/01-spark/count_trips.py @@ -9,6 +9,7 @@ # 우리가 가져올 데이터가 있는 파일 directory = "C:\projects\DataEngineering\data-engineering\01-spark\data" +# filename = "fhvhv_tripdata_2020-03.csv" filename = "fhvhv_tripdata_2020-03.parquet" # 데이터 파싱 @@ -23,4 +24,5 @@ # 아래는 Spark코드가 아닌 일반적인 파이썬 코드 # CSV로 결과값 저장 -pd.Series(result, name="trips").to_csv("trips_date1.csv") \ No newline at end of file +# pd.Series(result, name="trips").to_csv("trips_date.csv") +pd.Series(result, name="trips").to_csv("trips_date1.csv") From 09fe2bafafae5bd58aacf228f6b985bf2699b071 Mon Sep 17 00:00:00 2001 From: HKYoo95 Date: Thu, 23 Feb 2023 01:11:59 +0900 Subject: [PATCH 6/6] test --- 01-spark/check_data.py | 4 +++- 01-spark/count_trips.py | 3 +++ 01-spark/trips_date1.csv | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 01-spark/trips_date1.csv diff --git a/01-spark/check_data.py b/01-spark/check_data.py index 3368532..7a328da 100644 --- a/01-spark/check_data.py +++ b/01-spark/check_data.py @@ -1,5 +1,7 @@ import pandas as pd import pyarrow.parquet as parquet -df=pd.read_parquet('./data/fhvhv_tripdata_2020-03.parquet') +#df=pd.read_parquet('./data/fhvhv_tripdata_2020-03.parquet') +#df.to_csv('fhvhv_tripdata_2020-03.csv') +df=pd.read_csv('./data/fhvhv_tripdata_2020-03.csv') print(df.head(5)) \ No newline at end of file diff --git a/01-spark/count_trips.py b/01-spark/count_trips.py index 8861b2d..8ae0576 100644 --- a/01-spark/count_trips.py +++ b/01-spark/count_trips.py @@ -2,6 +2,9 @@ from pyspark import SparkConf, SparkContext import pyarrow.parquet as parquet import pandas as pd +import findspark + +findspark.init() # Spark 설정 conf = SparkConf().setMaster("local[*]").setAppName("uber-date-trips") diff --git a/01-spark/trips_date1.csv b/01-spark/trips_date1.csv new file mode 100644 index 0000000..6c443d6 --- /dev/null +++ b/01-spark/trips_date1.csv @@ -0,0 +1,34 @@ +,trips +2020-03-01,780219 +2020-02-29,5603 +2020-03-02,648912 +2020-03-03,697798 +2020-03-04,708065 +2020-03-05,731656 +2020-03-06,873517 +2020-03-07,887260 +2020-03-08,728148 +2020-03-09,628813 +2020-03-10,626369 +2020-03-11,628609 +2020-03-12,643230 +2020-03-13,661432 +2020-03-14,569556 +2020-03-15,447181 +2020-03-16,391081 +2020-03-17,312085 +2020-03-18,269258 +2020-03-19,252764 +2020-03-20,262023 +2020-03-21,215661 +2020-03-22,162004 +2020-03-23,162343 +2020-03-24,141719 +2020-03-25,141074 +2020-03-26,141644 +2020-03-27,159481 +2020-03-28,138392 +2020-03-29,115416 +2020-03-30,132537 +2020-03-31,129077 +2020-04-01,1