diff --git a/1.py b/1.py
new file mode 100644
index 0000000..bf37553
--- /dev/null
+++ b/1.py
@@ -0,0 +1,6 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+df = pd.read_csv('works.csv')
+rows_count = df.shape[0]
+print("Всего записей:", rows_count)
\ No newline at end of file
diff --git a/2.py b/2.py
new file mode 100644
index 0000000..a21ed6f
--- /dev/null
+++ b/2.py
@@ -0,0 +1,11 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+df = pd.read_csv('works.csv')
+def get_rows_with_gender(df: pd.DataFrame, gender: str) -> pd.DataFrame:
+    """Return the rows of df whose "gender" column equals gender."""
+    return df[df["gender"] == gender]
+females = get_rows_with_gender(df, "Женский")
+males = get_rows_with_gender(df, "Мужской")
+print("Всего женщин:", females.shape[0])
+print("Всего мужчин:", males.shape[0])
\ No newline at end of file
diff --git a/2021-12-17_12-59-17.png b/2021-12-17_12-59-17.png
new file mode 100644
index 0000000..6e5e35e
Binary files /dev/null and b/2021-12-17_12-59-17.png differ
diff --git a/2021-12-17_13-10-21.png b/2021-12-17_13-10-21.png
new file mode 100644
index 0000000..c09aad7
Binary files /dev/null and b/2021-12-17_13-10-21.png differ
diff --git a/2021-12-17_13-10-47.png b/2021-12-17_13-10-47.png
new file mode 100644
index 0000000..5049d82
Binary files /dev/null and b/2021-12-17_13-10-47.png differ
diff --git a/2021-12-17_13-17-11.png b/2021-12-17_13-17-11.png
new file mode 100644
index 0000000..0b63a81
Binary files /dev/null and b/2021-12-17_13-17-11.png differ
diff --git a/2021-12-17_13-17-31.png b/2021-12-17_13-17-31.png
new file mode 100644
index 0000000..2150290
Binary files /dev/null and b/2021-12-17_13-17-31.png differ
diff --git a/2021-12-17_14-32-59.png b/2021-12-17_14-32-59.png
new file mode 100644
index 0000000..0201839
Binary files /dev/null and b/2021-12-17_14-32-59.png differ
diff --git a/2021-12-17_14-34-26.png b/2021-12-17_14-34-26.png
new file mode 100644
index 0000000..2964764
Binary files /dev/null and b/2021-12-17_14-34-26.png differ
diff --git a/2021-12-17_14-38-53.png b/2021-12-17_14-38-53.png
new file mode 100644
index 0000000..842890e
Binary files /dev/null and b/2021-12-17_14-38-53.png differ
diff --git a/3.py b/3.py
new file mode 100644
index 0000000..47f3d7a
--- /dev/null
+++ b/3.py
@@ -0,0 +1,12 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+df = pd.read_csv('works.csv')
+def get_rows_with_gender(df: pd.DataFrame, gender: str) -> pd.DataFrame:
+    """Return the rows of df whose "gender" column equals gender."""
+    return df[df["gender"] == gender]
+females = get_rows_with_gender(df, "Женский")
+males = get_rows_with_gender(df, "Мужской")
+no_none_skills_count = df["skills"].count()
+
+print("Значений в столбце skills не NAN:", no_none_skills_count)
\ No newline at end of file
diff --git a/4.py b/4.py
new file mode 100644
index 0000000..0a39f40
--- /dev/null
+++ b/4.py
@@ -0,0 +1,11 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+df = pd.read_csv('works.csv')
+def get_rows_with_gender(df: pd.DataFrame, gender: str) -> pd.DataFrame:
+    """Return the rows of df whose "gender" column equals gender."""
+    return df[df["gender"] == gender]
+females = get_rows_with_gender(df, "Женский")
+males = get_rows_with_gender(df, "Мужской")
+skills = df[df["skills"].notna()]
+print("Все заполненные скиллы:\n", skills["skills"])
diff --git a/5.py b/5.py
new file mode 100644
index 0000000..d061206
--- /dev/null
+++ b/5.py
@@ -0,0 +1,12 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+works = pd.read_csv("works.csv")
+head = works.head(5)
+tail = works.tail(5)
+
+# na=False maps missing skills to False instead of NaN, so the mask is purely boolean.
+skills_bool = works["skills"].str.lower().str.contains("python|питон", na=False)
+print(works[skills_bool]["salary"])
diff --git a/6.py b/6.py
new file mode 100644
index 0000000..f3a7b3f
--- /dev/null
+++ b/6.py
@@ -0,0 +1,13 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+df = pd.read_csv('works.csv')
+def get_rows_with_gender(df: pd.DataFrame, gender: str) -> pd.DataFrame:
+    """Return the rows of df whose "gender" column equals gender."""
+    return df[df["gender"] == gender]
+females = get_rows_with_gender(df, "Женский")
+males = get_rows_with_gender(df, "Мужской")
+# Quantile levels 0.1, 0.2, ..., 1.0.
+pr = [i / 10 for i in range(1, 11)]
+print("Женщины:\n", females["salary"].quantile(pr))
+print("Мужчины\n", males["salary"].quantile(pr))
\ No newline at end of file
diff --git a/7.py b/7.py
new file mode 100644
index 0000000..5f4679c
--- /dev/null
+++ b/7.py
@@ -0,0 +1,22 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as mp
+
+works = pd.read_csv("works.csv")
+# Average only the salary column: a frame-wide mean fails on text columns in pandas >= 2.0.
+men_salary = works.query("gender == 'Мужской'").groupby("educationType")["salary"].mean()
+women_salary = works.query("gender == 'Женский'").groupby("educationType")["salary"].mean()
+
+# Align both series on the union of education types so paired bars share a category.
+educationTypes = men_salary.index.union(women_salary.index)
+men_salaries = men_salary.reindex(educationTypes).values
+women_salaries = women_salary.reindex(educationTypes).values
+
+index = np.arange(len(educationTypes))
+
+bw = 0.4
+mp.bar(index-bw/2, men_salaries, bw, color="b", label="Средняя зарплата мужчин")
+mp.bar(index+bw/2, women_salaries, bw, color="r", label="Средняя зарплата женщин")
+mp.xticks(index, educationTypes, rotation=45)
+mp.legend()
+mp.show()
\ No newline at end of file
diff --git a/8.py b/8.py
new file mode 100644
index 0000000..7a7de53
--- /dev/null
+++ b/8.py
@@ -0,0 +1,40 @@
+import pandas as pd
+
+
+def get_match_count(first_list, second_list):
+    """Count aligned pairs in which either text shares a word with the other."""
+    return sum(
+        contains(first, second) or contains(second, first)
+        for first, second in zip(first_list, second_list)
+    )
+
+
+def contains(sub_text, text):
+    """Return True if any word of sub_text occurs as a substring of text.
+
+    Hyphens count as word separators. Empty fragments are never produced,
+    because an empty string is a substring of every text and would match anything.
+    """
+    # split() without an argument collapses whitespace runs and drops empty words.
+    words = sub_text.replace('-', ' ').split()
+    return any(word in text for word in words)
+
+
+def get_top(source, search_field, return_field, value):
+    """Return the 5 most frequent return_field values among rows whose search_field contains value."""
+    return source[source[search_field].str.contains(value)][return_field].value_counts().head(5)
+
+
+data = pd.read_csv('works.csv').dropna().apply(lambda x: x.astype(str).str.lower())
+count = len(data)
+mismatch_count = count - get_match_count(data["jobTitle"], data["qualification"])
+
+print(f"Всего людей: {count}.")
+print(f"Людей с несовпадающими профессией и должностью: {mismatch_count}.")
+# Guard the ratio: dropna() can leave no rows, and 0 / 0 would raise ZeroDivisionError.
+mismatch_share = mismatch_count / count if count else 0
+print(f"Что составляет {mismatch_share:.0%} от общего числа.")
+print("\nТоп 5 квалификаций менеджеров:")
+print(get_top(data, "jobTitle", "qualification", "менеджер"))
+print("\nТоп 5 должностей инженеров:")
+print(get_top(data, "qualification", "jobTitle", "инженер"))
\ No newline at end of file