AgentMove/utils.py at main · lvhy24/AgentMove · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
import os
import re
import glob
import json
import jsmin
import argparse
import json_repair
import numpy as np

import hashlib
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from typing import Dict, Tuple
from datetime import datetime
from math import radians, sin, cos, sqrt, atan2


from config import EXP_CITIES, PROCESSED_DIR
from token_count import TokenCount


def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    All four coordinates are given in decimal degrees. The distance is
    computed with the haversine formula on a sphere of radius 6371.0 km.

    :param lat1: latitude of the first point, in degrees
    :param lon1: longitude of the first point, in degrees
    :param lat2: latitude of the second point, in degrees
    :param lon2: longitude of the second point, in degrees
    :return: distance between the two points, in kilometres
    """
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (nearly) antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    radius = 6371.0
    return radius * c


def create_dir(dir):
    """Create the directory `dir` (and any missing parents) if it is absent.

    `exist_ok=True` replaces the previous exists-then-create check, which
    could fail when another process created the directory between the check
    and the `makedirs` call.

    :param dir: path of the directory to create
    :raises FileExistsError: if `dir` exists but is not a directory
    """
    os.makedirs(dir, exist_ok=True)


def convert_time(dataset, model, original_time_str):
    """Re-format a check-in time string into the layout a given model expects.

    :param dataset: dataset name; 'Shanghai' (WWW 2019 Shanghai-ISP) uses a
        timestamp layout without a UTC offset, every other dataset includes one
    :param model: one of "GETNext", "SNPM" or "STHM"
    :param original_time_str: the timestamp string to convert
    :return: the timestamp rendered in the model-specific format
    :raises ValueError: if `model` is not supported (or the string does not
        match the expected input layout)
    """
    # Pick the input layout of the original time string.
    if dataset in ['Shanghai']:  # for WWW 2019 Shanghai-ISP
        input_layout = "%a %b %d %H:%M:%S %Y"
    else:
        input_layout = "%a %b %d %H:%M:%S %z %Y"
    parsed_time = datetime.strptime(original_time_str, input_layout)

    # Render with the output layout registered for the requested model.
    output_layouts = (
        ("GETNext", "%Y-%m-%d %H:%M:%S"),
        ("SNPM", "%Y-%m-%dT%H:%M:%SZ"),
        ("STHM", "%Y-%m-%dT%H:%M:%S"),
    )
    for model_name, layout in output_layouts:
        if model == model_name:
            return parsed_time.strftime(layout)
    raise ValueError("Unsupported model type. Supported models are: GETNext, SNPM, STHM.")


def string_to_md5_hex(s):
    """Return the hexadecimal MD5 digest of the UTF-8 encoding of `s`."""
    # hashlib works on bytes, so the string is encoded before hashing.
    return hashlib.md5(s.encode('utf-8')).hexdigest()


def convert_timestamp(dataset, time_str):
    """Return the time of day of `time_str` as a fraction of a full day.

    :param dataset: dataset name; 'Shanghai' (WWW 2019 Shanghai-ISP) uses a
        timestamp layout without a UTC offset, every other dataset includes one
    :param time_str: the timestamp string to parse
    :return: float in [0, 1): 0.0 at midnight, 0.5 at noon
    """
    if dataset in ['Shanghai']:  # for WWW 2019 Shanghai-ISP
        layout = "%a %b %d %H:%M:%S %Y"
    else:
        layout = "%a %b %d %H:%M:%S %z %Y"
    moment = datetime.strptime(time_str, layout)

    # Neither layout carries sub-second precision, so the hour/minute/second
    # fields fully describe the offset from midnight.
    seconds_since_midnight = moment.hour * 3600 + moment.minute * 60 + moment.second
    minutes_since_midnight = seconds_since_midnight / 60
    return minutes_since_midnight / (24 * 60)


def replace_original_poi_id(fs):
    """Overwrite `PoiId` with a 1-based id per (latitude, longitude, category).

    Rows sharing the same 'Latitude', 'Longitude' and 'PoiCategoryId' get the
    same new id. The frame is modified in place and also returned.

    :param fs: DataFrame with 'Latitude', 'Longitude' and 'PoiCategoryId' columns
    :return: the same DataFrame, with 'PoiId' replaced
    """
    group_keys = ['Latitude', 'Longitude', 'PoiCategoryId']
    # Number the groups in a scratch column first ...
    fs['temp_id'] = fs.groupby(group_keys).ngroup() + 1
    # ... then move it into PoiId; pop() removes the scratch column in place.
    fs['PoiId'] = fs.pop('temp_id')
    return fs


def id_encode(fit_df: pd.DataFrame, encode_df: pd.DataFrame, column: str, padding: int = -1) -> Tuple[LabelEncoder, int]:
    """Label-encode `column` of `encode_df` in place, using labels from `fit_df`.

    This is the in-place special case of `encode_poi_catid` (source and target
    column are the same), so it delegates to it rather than duplicating the
    logic.

    :param fit_df: DataFrame whose `column` values define the known labels
    :param encode_df: DataFrame whose `column` is replaced by integer codes
    :param column: name of the column to encode
    :param padding: if 0, codes start at 1 and unknown values map to 0;
        otherwise codes start at 0 and unknown values map to the number of
        known labels
    :return: the fitted LabelEncoder and the id used for unknown values
    """
    return encode_poi_catid(fit_df, encode_df, column, column, padding)


def ignore_first(df: pd.DataFrame) -> pd.DataFrame:
    """
    Tag the first check-in of every trajectory as 'ignore' (it has no history).

    Adds 'pseudo_session_trajectory_rank', 'query_pseudo_session_trajectory_id'
    and 'last_checkin_epoch_time' columns and updates 'SplitTag'. The frame is
    modified in place and also returned.
    """
    trajectory_col = 'pseudo_session_trajectory_id'
    rank_col = 'pseudo_session_trajectory_rank'

    # Position of each check-in inside its trajectory, ordered by UTCTimeOffset.
    df[rank_col] = df.groupby(trajectory_col)['UTCTimeOffset'].rank(method='first')
    starts_trajectory = df[rank_col] == 1

    # Each row inherits a value from the previous row, except at the start of a
    # trajectory, where there is no previous check-in to inherit from.
    shifted_columns = (
        ('query_pseudo_session_trajectory_id', trajectory_col),
        ('last_checkin_epoch_time', 'UTCTimeOffsetEpoch'),
    )
    for target_col, source_col in shifted_columns:
        df[target_col] = df[source_col].shift()
        df.loc[starts_trajectory, target_col] = None

    df.loc[df['UserRank'] == 1, 'SplitTag'] = 'ignore'
    df.loc[starts_trajectory, 'SplitTag'] = 'ignore'
    return df


def encode_poi_catid(
        fit_df: pd.DataFrame,
        encode_df: pd.DataFrame,
        source_column: str,
        target_column: str,
        padding: int = -1
) -> Tuple[LabelEncoder, int]:
    """
    Encode the values of `source_column` into integer ids in `target_column`,
    similar to STPM's id_encode function.

    :param fit_df: DataFrame used to fit the LabelEncoder
    :param encode_df: DataFrame to encode (modified in place)
    :param source_column: column holding the raw values
    :param target_column: column receiving the encoded ids
    :param padding: if 0, ids start at 1 and unknown values map to 0;
        otherwise ids start at 0 and unknown values map to the number of
        known labels
    :return: the fitted LabelEncoder and the id used for unknown values
    """
    # Fit the encoder on the reference frame.
    id_le = LabelEncoder()
    id_le = id_le.fit(fit_df[source_column].values.tolist())

    if padding == 0:
        # Reserve 0 for unknown values, so real ids start at 1.
        padding_id = padding
        offset = 1
    else:
        # Unknown values get the id right after the largest real id.
        padding_id = len(id_le.classes_)
        offset = 0

    # A label's code is its position in `classes_`. Building the lookup once
    # avoids a linear membership scan plus a one-element `transform` per row.
    code_of = {
        label: code + offset
        for code, label in enumerate(id_le.classes_.tolist())
    }
    encode_df[target_column] = [
        code_of.get(value, padding_id)
        for value in encode_df[source_column].values.tolist()
    ]

    return id_le, padding_id


def int_to_days(int_day):
    """Map a weekday index (0 = Monday ... 6 = Sunday) to its English name.

    Any other value yields "NA".
    """
    weekday_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')
    name_by_index = dict(enumerate(weekday_names))
    return name_by_index.get(int_day, "NA")


def list_predicted_users(folder_path):
    """Return the distinct user ids found in the `.json` file names of a folder.

    The user id is taken as the second-to-last '_'-separated field of the file
    name. `.json` files whose name contains no '_' are skipped instead of
    raising IndexError, and the result is sorted so it is reproducible.

    :param folder_path: directory to scan (not recursive)
    :return: sorted list of unique user id strings
    """
    users = set()
    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.json'):
            continue
        fields = file_name.split('_')
        if len(fields) < 2:
            # Name does not follow the "<...>_<user>_<...>.json" layout.
            continue
        users.add(fields[-2])
    return sorted(users)


def match_prediction(text, prediction_key="prediction"):
    """Extract 24-character lowercase hex ids listed between a keyword and "reason".

    The keyword is "recommendation" when `prediction_key` is "recommendation",
    and "prediction" for every other value. The first letter of both the
    keyword and "reason" may be upper or lower case.

    :param text: raw text to search
    :param prediction_key: selects which keyword opens the searched span
    :return: list of matched id strings; empty if the span is not found
    """
    if prediction_key == "recommendation":
        keyword = r'[Rr]ecommendation'
    else:
        keyword = r'[Pp]rediction'

    # Lazily capture everything between the keyword and the next "reason".
    span = re.search(keyword + r'(.*?)[Rr]eason', text, re.DOTALL)
    if not span:
        return []
    return re.findall(r'\b[0-9a-f]{24}\b', span.group(1))


def token_count(text):
    """Return the token count of `text` as reported by TokenCount for "gpt-3.5-turbo"."""
    counter = TokenCount(model_name="gpt-3.5-turbo")
    n_tokens = counter.num_tokens_from_string(text)
    return n_tokens


def _parse_bracketed_prediction(full_text):
    """Parse the outermost ``[...]`` span of `full_text`; None if there is none."""
    start = full_text.find('[')
    end = full_text.rfind(']')
    if start == -1 or end < start:
        return None
    bracketed = full_text[start:end + 1]
    try:
        items = json.loads(bracketed)
    except (ValueError, RecursionError):
        # Not valid JSON: hand back the raw slice, as before.
        return bracketed
    try:
        return [int(item) for item in items]
    except (TypeError, ValueError, OverflowError):
        # Non-numeric entries (e.g. string venue ids): keep the parsed list.
        return items


def extract_json(full_text, prediction_key="prediction"):
    """Extract the prediction and reason from a model response.

    The outermost ``{...}`` span of `full_text` is parsed as JSON after a
    best-effort comment-stripping pass with jsmin. If that yields a JSON
    object, the prediction is read from `prediction_key`, falling back to
    `match_prediction` on the raw text when the key is missing or empty. If
    it does not, the outermost ``[...]`` span is tried, and finally
    `match_prediction` on the raw text.

    :param full_text: raw model response; non-string input is treated as empty
    :param prediction_key: key of the prediction in the JSON object
    :return: tuple ``(output_json, prediction, reason)``. `output_json` is the
        parsed object, or a dict with "raw_response", "prediction" and
        "reason" keys when the response was not a JSON object
    """
    if not isinstance(full_text, str):
        return {"raw_response": ""}, "", ""

    start = full_text.find('{')
    end = full_text.rfind('}')
    # Without a proper {...} span, try to parse the whole response instead.
    json_str = full_text[start:end + 1] if start != -1 and end > start else full_text

    # Remove potential comments in json_str. This is deliberately best-effort:
    # whatever goes wrong here, parsing proceeds with the unmodified text.
    try:
        json_str = jsmin.jsmin(json_str)
    except Exception:
        pass

    try:
        parsed = json.loads(json_str)
    except (ValueError, RecursionError):
        parsed = None

    if isinstance(parsed, dict):
        prediction = parsed.get(prediction_key)
        is_empty = isinstance(prediction, (str, list, dict)) and not prediction
        if prediction is None or is_empty:
            # The regex fallback needs the raw text, not the parsed object.
            prediction = match_prediction(full_text, prediction_key)
        return parsed, prediction, parsed.get('reason')

    # Not a JSON object (invalid JSON, or a bare list/scalar): recover what we
    # can from the raw text and wrap it in a dictionary.
    prediction = _parse_bracketed_prediction(full_text)
    if prediction is None:
        prediction = match_prediction(full_text, prediction_key)
    output_json = {
        "raw_response": full_text,
        "prediction": prediction,
        "reason": ""
    }
    return output_json, prediction, ""


def token_analyis(file_path, inlcude=None):
    """Print token-length statistics for the "input" field of result files.

    `file_path` is a glob pattern. The first match is descended into; inside
    it, either the first entry (when `inlcude` is None) or the first entry
    whose path contains `inlcude` is descended into again. Every file at that
    level is loaded as JSON and the tokens of its "input" value are counted.

    Prints the tuple (pattern, number of files, median, 90th percentile, max,
    sum) of the token counts.

    :param file_path: glob pattern selecting the top-level results directory
    :param inlcude: optional substring selecting the sub-directory to analyse
    """
    # for city in ["NewYork", "Tokyo", "Shanghai"]:
    # file_path = f"results/20240803/{city}/agentmove/*"
    print(file_path)
    file_path = os.path.join(glob.glob(file_path)[0], "*")
    print(file_path)
    if inlcude is None:
        file_path = os.path.join(glob.glob(file_path)[0], "*")
    else:
        for file in glob.glob(file_path):
            if inlcude in file:
                file_path = os.path.join(file, "*")
                break
    print(file_path)
    lens = []
    for file in glob.glob(file_path):
        with open(file) as fid:
            data = json.load(fid)
        lens.append(token_count(data["input"]))
    if not lens:
        # Nothing matched: report it instead of failing inside max().
        print((file_path, 0))
        return
    # np.percentile takes q on a 0-100 scale: 50 is the median, 90 the 90th
    # percentile (0.5 / 0.9 would be the 0.5th / 0.9th percentiles).
    res = (file_path, len(lens), np.percentile(lens, 50), np.percentile(lens, 90), max(lens), np.sum(lens))
    print(res)


def generate_graphs():
    """Construct a Dataset and a SocialWorld for every city in EXP_CITIES.

    The objects are built only for their construction side effects; nothing is
    returned. NOTE(review): presumably the constructors write their processed
    output under PROCESSED_DIR — confirm in `processing.data` and
    `models.world_model`.
    """
    # Imported lazily so that importing this module does not pull them in.
    from models.world_model import SocialWorld
    from processing.data import Dataset

    for city_name in EXP_CITIES:
        print(f"processing {city_name}")
        # NOTE(review): historical_stays is 16 here but 15 in generate_data —
        # confirm the difference is intentional.
        dataset_options = {
            "dataset_name": city_name,
            "traj_min_len": 3,
            "trajectory_mode": "trajectory_split",
            "historical_stays": 16,
            "context_stays": 6,
            "save_dir": PROCESSED_DIR,
            "use_int_venue": False,
        }
        dataset = Dataset(**dataset_options)

        world_options = {
            "traj_dataset": dataset,
            "save_dir": PROCESSED_DIR,
            "city_name": city_name,
            "khop": 1,
            "max_neighbors": 10,
        }
        social_world = SocialWorld(**world_options)


def generate_data():
    """Construct a Dataset for every city in EXP_CITIES.

    The objects are built only for their construction side effects; nothing is
    returned. NOTE(review): presumably the constructor writes its processed
    output under PROCESSED_DIR — confirm in `processing.data`.
    """
    # Imported lazily so that importing this module does not pull it in.
    from processing.data import Dataset

    for city_name in EXP_CITIES:
        print(f"processing {city_name}")
        dataset_options = {
            "dataset_name": city_name,
            "traj_min_len": 3,
            "trajectory_mode": "trajectory_split",
            "historical_stays": 15,
            "context_stays": 6,
            "save_dir": PROCESSED_DIR,
            "use_int_venue": False,
        }
        dataset = Dataset(**dataset_options)

if __name__ == "__main__":
    # Script entry point. Only dataset generation is active; the alternative
    # entry points below are kept commented out and can be re-enabled by hand.

    # Token statistics over saved results (CLI-driven):
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--file_path', type=str, default="")
    # parser.add_argument('--include', type=str, default="")
    # args = parser.parse_args()

    # token_analyis(args.file_path, args.include)

    # Dataset plus social-graph generation:
    # generate_graphs()

    generate_data()