diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..d56657a
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..2cdf9da
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/pandas_task.iml b/.idea/pandas_task.iml
new file mode 100644
index 0000000..d0876a7
--- /dev/null
+++ b/.idea/pandas_task.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/homework.py b/homework.py
new file mode 100644
index 0000000..fca7e0e
--- /dev/null
+++ b/homework.py
@@ -0,0 +1,51 @@
+"""Compare job titles with diploma qualifications from works.csv."""
+import pandas
+
+
+def comp(f1, f2):
+    """Return True if any word of f1 occurs as a substring of f2.
+
+    The comparison is case-insensitive and hyphens in f1 separate words.
+    """
+    haystack = f2.lower()  # hoisted: lowered once, not once per word
+    words = f1.lower().replace('-', ' ').split()
+    return any(word in haystack for word in words)
+
+
+def count(field1, field2, jobs):
+    """Count rows of jobs where field1 and field2 share no word either way."""
+    return sum(
+        1
+        for f1, f2 in zip(jobs[field1], jobs[field2])
+        if not comp(f1, f2) and not comp(f2, f1)
+    )
+
+
+def top_values(jobs, filter_field, stem, target_field, limit=5):
+    """Return the top target_field values of rows whose filter_field contains stem.
+
+    Both columns are lower-cased; stem is matched as a plain substring.
+    """
+    matches = jobs[filter_field].str.lower().str.contains(stem, regex=False)
+    return jobs.loc[matches, target_field].str.lower().value_counts().head(limit)
+
+
+def main():
+    # Only the two compared columns have to be present; a bare dropna() also
+    # discards people whose unrelated columns (e.g. skills) are empty.
+    works = pandas.read_csv("works.csv").dropna(subset=["jobTitle", "qualification"])
+
+    result = count("jobTitle", "qualification", works)
+    print(f"Из {works.shape[0]} людей не совпадают профессия и должность у {result}")
+
+    print("\nТоп образований людей для менеджеров")
+    # "менедж" is the stem the original spelled as 'менеджер'[:-2].
+    print(top_values(works, "jobTitle", "менедж", "qualification"))
+
+    print("\nТоп должностей людей, которые по диплому являются инженерами")
+    # Engineers *by diploma*: filter on qualification and rank their job titles.
+    print(top_values(works, "qualification", "инжен", "jobTitle"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/proj.py b/proj.py
new file mode 100644
index 0000000..fe31781
--- /dev/null
+++ b/proj.py
@@ -0,0 +1,81 @@
+"""Exploratory look at works.csv: inspection, filtering and salary plots."""
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+
+MALE = "Мужской"
+FEMALE = "Женский"
+PYTHON_PATTERN = "python|питон"
+
+
+def plot_salary_percentiles(works, gender, ylabel):
+    """Plot the salary deciles (0.1 .. 1.0 quantiles) of one gender."""
+    percentiles = np.linspace(.1, 1, 10)
+    # Take the salary column explicitly: DataFrame.quantile() over the whole
+    # frame fails on the text columns in pandas >= 2.0.
+    salary = works.loc[works["gender"] == gender, "salary"].quantile(percentiles)
+    fig, ax = plt.subplots()
+    ax.plot(percentiles, salary)
+    ax.set_xlabel("Перцентили")
+    ax.set_ylabel(ylabel)
+    plt.show()
+
+
+def plot_mean_salary_by_education(works):
+    """Draw side-by-side bars of mean salary per education type for each gender."""
+    # A single pivot keeps both genders aligned on the same education types;
+    # two independent groupbys can differ in length and order.
+    mean_salary = (
+        works.groupby(["educationType", "gender"])["salary"]
+        .mean()
+        .unstack("gender")
+        .reindex(columns=[MALE, FEMALE])
+    )
+    positions = np.arange(len(mean_salary.index))  # avoids shadowing builtin id
+    plt.bar(positions - 0.2, mean_salary[MALE], 0.4, color="g",
+            label="Средняя зарплата мужчин")
+    plt.bar(positions + 0.2, mean_salary[FEMALE], 0.4, color="y",
+            label="Средняя зарплата женщин")
+    plt.xticks(positions, mean_salary.index, rotation=45)
+    plt.legend()
+    plt.show()
+
+
+def main():
+    works = pd.read_csv("works.csv")
+    print(works["skills"].str.lower().str.contains(PYTHON_PATTERN))
+
+    print(works.head(5))
+    print(works.tail(5))
+    print(works.shape[0])
+    print(len(works.index))
+
+    print(works[works["gender"] == MALE].shape[0])
+    print((works["gender"] == FEMALE).sum())
+    print(works["gender"].value_counts())
+
+    print(works["skills"].notnull().sum())
+    works.info()  # info() prints by itself and returns None
+    print(works["skills"].count())
+
+    print(works[works["skills"].notnull()]["skills"])
+    print(works["skills"].dropna())
+    # NaN != NaN, so "skills == skills" keeps only rows with skills filled in.
+    print(works.query("skills == skills")["skills"])
+    print(works.query("salary == 15000"))
+    edu = "Высшее"
+    gen = FEMALE
+    selected = works.query("educationType == @edu and gender == @gen")
+    print(selected[["salary", "educationType", "gender"]])
+
+    # na=False turns missing skills into False so the mask is purely boolean.
+    mask = works["skills"].str.lower().str.contains(PYTHON_PATTERN, na=False)
+    print(works[mask]["salary"])
+
+    plot_salary_percentiles(works, MALE, "Зарплата мужчин")
+    plot_salary_percentiles(works, FEMALE, "Зарплата женщин")
+    plot_mean_salary_by_education(works)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file