From 77dccb9103fabeba734516c0853521ae8da7608f Mon Sep 17 00:00:00 2001 From: romcq Date: Sat, 11 Dec 2021 18:27:12 +0500 Subject: [PATCH 1/2] classwork --- .idea/.gitignore | 3 +++ .../inspectionProfiles/profiles_settings.xml | 6 +++++ .idea/misc.xml | 4 +++ .idea/modules.xml | 8 ++++++ .idea/pandas_task.iml | 8 ++++++ .idea/vcs.xml | 6 +++++ proj.py | 25 +++++++++++++++++++ 7 files changed, 60 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/pandas_task.iml create mode 100644 .idea/vcs.xml create mode 100644 proj.py diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..d56657a --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..2cdf9da --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/pandas_task.iml b/.idea/pandas_task.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/.idea/pandas_task.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/proj.py b/proj.py new file mode 100644 index 0000000..a98fc10 --- /dev/null +++ 
b/proj.py
@@ -0,0 +1,25 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as nlp
+
+works = pd.read_csv('works.csv')
+# print(works.head(10))
+# print()
+# print(works.tail(10))
+# print()
+# print(works.info())
+# print()
+print(works.shape[0])
+print()
+# print((works.gender == 'Мужской').sum())
+# print()
+# print(works[works.gender == 'Женский'].shape[0])
+# print()
+# print((works['skills'].notna()).sum())
+# print()
+# print(works.info)
+# print()
+# print(works['skills'].count())
+# print()
+# print(works[works['skills'].notna()]['skills'])
+print(works['skills'].dropna())

From b0a5b45c6b549e01b84d0880f2de970210c020ee Mon Sep 17 00:00:00 2001
From: romcq
Date: Sat, 25 Dec 2021 11:05:13 +0500
Subject: [PATCH 2/2] pandas work

---
 homework.py | 33 +++++++++++++++++++++
 proj.py     | 85 ++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 97 insertions(+), 21 deletions(-)
 create mode 100644 homework.py

diff --git a/homework.py b/homework.py
new file mode 100644
index 0000000..fca7e0e
--- /dev/null
+++ b/homework.py
@@ -0,0 +1,33 @@
+import pandas
+
+works = pandas.read_csv("works.csv").dropna()
+
+
+def count(field1, field2, jobs):
+    """Count rows of jobs where field1 and field2 share no word in either direction."""
+    return sum(
+        not comp(f1, f2) and not comp(f2, f1)
+        for f1, f2 in zip(jobs[field1], jobs[field2])
+    )
+
+
+def comp(f1, f2):
+    """Return True if any word of f1 occurs as a substring of f2 (case-insensitive)."""
+    haystack = f2.lower()  # lowered once instead of once per word
+    # Hyphens are treated as word separators before splitting on whitespace.
+    words = f1.lower().replace('-', ' ').split()
+    return any(word in haystack for word in words)
+
+
+result = count("jobTitle", "qualification", works)
+print("Из {} людей не совпадают профессия и должность у {}".format(works.shape[0], result))
+
+print("\nТоп образований людей для менеджеров")
+print(
+    works[works['jobTitle'].str.lower().str.contains('менеджер'[:-2])]['qualification'].str.lower().value_counts().head(
+        5))
+
+print("\nТоп должностей людей, которые по диплому являются инженерами")
+print(
+    works[works['qualification'].str.lower().str.contains('инженер'[:-2])]['jobTitle'].str.lower().value_counts().head(
+        5))
diff --git a/proj.py b/proj.py
index a98fc10..fe31781 100644
--- a/proj.py
+++ b/proj.py
@@ -1,25 +1,68 @@
 import pandas as pd
 import numpy as np
-import matplotlib.pyplot as nlp
-
-works = pd.read_csv('works.csv')
-# print(works.head(10))
-# print()
-# print(works.tail(10))
-# print()
-# print(works.info())
-# print()
+import matplotlib.pyplot as plt
+
+works = pd.read_csv("works.csv")
+print(works['skills'].str.lower().str.contains('python|питон'))
+
+# The CSV is already loaded above; a second read_csv here was redundant I/O.
+head = works.head(5)
+print(head)
+
+tail = works.tail(5)
+print(tail)
 print(works.shape[0])
-print()
-# print((works.gender == 'Мужской').sum())
-# print()
-# print(works[works.gender == 'Женский'].shape[0])
-# print()
-# print((works['skills'].notna()).sum())
-# print()
-# print(works.info)
-# print()
-# print(works['skills'].count())
-# print()
-# print(works[works['skills'].notna()]['skills'])
+print(len(works.index))
+
+print(works[works['gender'] == 'Мужской'].shape[0])
+print((works['gender'] == 'Женский').sum())
+print(works['gender'].value_counts())
+
+print(works['skills'].notnull().sum())
+works.info()  # info() prints its report itself and returns None, so it is not wrapped in print()
+print(works['skills'].count())
+
+print(works[works['skills'].notnull()]['skills'])
 print(works['skills'].dropna())
+print(works.query("skills == skills")["skills"])
+print(works.query("salary == 15000"))
+edu = 'Высшее'
+gen = 'Женский'
+print(works.query("educationType == @edu and gender == @gen")[['salary', 'educationType','gender']])
+
+mask = works["skills"].str.lower().str.contains("python|питон", na=False)  # na=False: missing skills never match
+print(works[mask]["salary"])
+
+percentiles = np.linspace(.1, 1, 10)
+
+gen = "Мужской"
+men_salary = works.query('gender == @gen')['salary'].quantile(percentiles)  # salary only, not every column
+fig, ax = plt.subplots()
+ax.plot(percentiles, men_salary)
+plt.xlabel('Перцентили')
+plt.ylabel('Зарплата мужчин')
+plt.show()
+
+gen = "Женский"
+women_salary = works.query('gender == @gen')['salary'].quantile(percentiles)  # salary only, not every column
+fig, ax = plt.subplots()
+ax.plot(percentiles, women_salary)
+plt.xlabel('Перцентили')
+plt.ylabel('Зарплата женщин')
+plt.show()
+
+gen = "Мужской"
+men_salary = works.query('gender == @gen').groupby("educationType")["salary"].mean().reset_index()
+men = men_salary['salary'].values
+gen = "Женский"
+women_salary = works.query('gender == @gen').groupby("educationType")["salary"].mean().reset_index()
+women = women_salary['salary'].values
+
+types = men_salary["educationType"].values
+positions = np.arange(len(types))  # renamed from `id` to avoid shadowing the builtin
+
+plt.bar(positions - 0.2, men, 0.4, color="g", label = "Средняя зарплата мужчин")
+plt.bar(positions + 0.2, women, 0.4, color="y", label = "Средняя зарплата женщин")
+plt.xticks(positions, types, rotation=45)
+plt.legend()
+plt.show()
\ No newline at end of file