gesturebot/classification.py at main · danielhufnagle/gesturebot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import mediapipe as mp
import cv2
import numpy as np
import tensorflow as tf

# MediaPipe solution modules shared by the detection and drawing helpers
mp_holistic = mp.solutions.holistic  # holistic model; also supplies HAND_CONNECTIONS
mp_hands = mp.solutions.hands  # hand model (not referenced by the code visible in this file)
mp_drawing = mp.solutions.drawing_utils  # landmark rendering helpers

def detection(image, model):
    """Run ``model`` on a BGR frame and return the frame with the results.

    Args:
        image: Frame in BGR channel order (as read from OpenCV).
        model: Object exposing ``process(rgb_image)``; the caller passes a
            MediaPipe Holistic instance.

    Returns:
        A ``(image, results)`` tuple: the frame converted back to BGR and
        whatever ``model.process`` returned.
    """
    # The model is fed RGB, so convert from OpenCV's BGR ordering first.
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Frame is read-only for the duration of the model call.
    # NOTE(review): MediaPipe's examples do this as a performance hint
    # (pass by reference) -- confirm it still matters for the version in use.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # Hand the frame back in BGR so it can be drawn on and displayed.
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

def draw_landmarks(image, results):
    """Draw the right-hand landmarks and connecting lines onto ``image``.

    Args:
        image: Frame to draw on.
        results: Detection results exposing ``right_hand_landmarks``.

    Returns:
        None.
    """
    # Styling for the landmark points and for the lines joining them.
    landmark_style = mp_drawing.DrawingSpec(color=(120, 0, 200), thickness=2, circle_radius=4)
    connection_style = mp_drawing.DrawingSpec(color=(250, 44, 250), thickness=2, circle_radius=4)

    mp_drawing.draw_landmarks(
        image,
        results.right_hand_landmarks,
        mp_holistic.HAND_CONNECTIONS,
        landmark_style,
        connection_style,
    )

def extract_keypoints(results):
    """Flatten the right-hand landmark coordinates into a 1-D array.

    Args:
        results: Detection results exposing ``right_hand_landmarks``, which
            is either falsy (no hand) or has a ``landmark`` iterable whose
            items carry ``x``, ``y`` and ``z`` attributes.

    Returns:
        A 1-D numpy array ``[x0, y0, z0, x1, y1, z1, ...]``; when no right
        hand is present, a zero vector of length ``21*3``.
    """
    hand = results.right_hand_landmarks
    if not hand:
        # No right hand in this frame: return an all-zero placeholder.
        return np.zeros(21*3)

    coords = [[point.x, point.y, point.z] for point in hand.landmark]
    return np.array(coords).flatten()

# Gesture labels the classifier can report. The array is indexed with the
# argmax of the model's prediction, so the order here must not change.
actions = np.array([
    'thumbs_up',
    'thumbs_down',
    'open',
    'fist',
    'peace',
    'point',
    'middle_finger',
    'rock',
    'stop',
    'okay',
    'call_me',
    'none',
])

# Load the trained Keras classifier from 'model.h5' (a relative path, so it
# is resolved against the current working directory).
model = tf.keras.models.load_model('model.h5')

# Wrap the classifier with a Softmax layer so predictions come out as
# probabilities.
# NOTE(review): presumably the saved model emits raw logits -- confirm.
probability_model = tf.keras.Sequential(
    [
        model,
        tf.keras.layers.Softmax(),
    ]
)

# Live classification loop: read webcam frames, detect the right hand,
# classify its landmarks and overlay the predicted gesture label on a
# mirrored preview until 'q' is pressed or the camera stops delivering frames.
vid = cv2.VideoCapture(0)
action = 'call_me' # not read anywhere in the code visible here; kept so the module-level name still exists
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while vid.isOpened():
            ok, image = vid.read()
            if not ok:
                # A failed grab yields no frame; stop instead of passing
                # None into cv2.cvtColor and crashing.
                break
            image, results = detection(image, holistic)
            draw_landmarks(image, results)
            # The model expects a batch, so wrap the single feature vector.
            keypoints = np.array([extract_keypoints(results)])
            # verbose=0 keeps Keras from printing a progress bar every frame.
            prediction = probability_model.predict(keypoints, verbose=0)
            # Mirror the frame first so the label text is not drawn reversed.
            image = cv2.flip(image, 1)
            cv2.putText(image, actions[np.argmax(prediction)], (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (250, 44, 250), 2, cv2.LINE_AA)
            cv2.imshow('Hands', image)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and the preview window, even if a frame
    # raised part-way through the loop.
    vid.release()
    cv2.destroyAllWindows()
    # The capture window does not close without a few extra waitKey calls.
    # NOTE(review): likely gives the GUI event loop time to process the
    # destroy request (commonly reported on macOS) -- confirm.
    for _ in range(4):
        cv2.waitKey(1)