diff --git a/01-spark/check_data.py b/01-spark/check_data.py index dfadc5f..7a328da 100644 --- a/01-spark/check_data.py +++ b/01-spark/check_data.py @@ -1,4 +1,7 @@ import pandas as pd +import pyarrow.parquet as parquet +#df=pd.read_parquet('./data/fhvhv_tripdata_2020-03.parquet') +#df.to_csv('fhvhv_tripdata_2020-03.csv') df=pd.read_csv('./data/fhvhv_tripdata_2020-03.csv') -print(df.head(5)) +print(df.head(5)) \ No newline at end of file diff --git a/01-spark/count_trips.py b/01-spark/count_trips.py index 77278ad..8ae0576 100644 --- a/01-spark/count_trips.py +++ b/01-spark/count_trips.py @@ -1,14 +1,19 @@ # 패키지를 가져오고 from pyspark import SparkConf, SparkContext +import pyarrow.parquet as parquet import pandas as pd +import findspark + +findspark.init() # Spark 설정 -conf = SparkConf().setMaster("local").setAppName("uber-date-trips") +conf = SparkConf().setMaster("local[*]").setAppName("uber-date-trips") sc = SparkContext(conf=conf) # 우리가 가져올 데이터가 있는 파일 -directory = "/Users/keon/fastcampus/data-engineering/01-spark/data" -filename = "fhvhv_tripdata_2020-03.csv" +directory = "C:\projects\DataEngineering\data-engineering\01-spark\data" +# filename = "fhvhv_tripdata_2020-03.csv" +filename = "fhvhv_tripdata_2020-03.parquet" # 데이터 파싱 lines = sc.textFile(f"file:///{directory}/{filename}") @@ -22,4 +27,5 @@ # 아래는 Spark코드가 아닌 일반적인 파이썬 코드 # CSV로 결과값 저장 -pd.Series(result, name="trips").to_csv("trips_date.csv") \ No newline at end of file +# pd.Series(result, name="trips").to_csv("trips_date.csv") +pd.Series(result, name="trips").to_csv("trips_date1.csv") diff --git a/01-spark/trips_date1.csv b/01-spark/trips_date1.csv new file mode 100644 index 0000000..6c443d6 --- /dev/null +++ b/01-spark/trips_date1.csv @@ -0,0 +1,34 @@ +,trips +2020-03-01,780219 +2020-02-29,5603 +2020-03-02,648912 +2020-03-03,697798 +2020-03-04,708065 +2020-03-05,731656 +2020-03-06,873517 +2020-03-07,887260 +2020-03-08,728148 +2020-03-09,628813 +2020-03-10,626369 +2020-03-11,628609 +2020-03-12,643230 +2020-03-13,661432 +2020-03-14,569556 +2020-03-15,447181 +2020-03-16,391081 +2020-03-17,312085 +2020-03-18,269258 +2020-03-19,252764 +2020-03-20,262023 +2020-03-21,215661 +2020-03-22,162004 +2020-03-23,162343 +2020-03-24,141719 +2020-03-25,141074 +2020-03-26,141644 +2020-03-27,159481 +2020-03-28,138392 +2020-03-29,115416 +2020-03-30,132537 +2020-03-31,129077 +2020-04-01,1