diff --git a/examples/BuddyLlama/op.txt b/examples/BuddyLlama/op.txt
deleted file mode 100644
index 24920f4e..00000000
--- a/examples/BuddyLlama/op.txt
+++ /dev/null
@@ -1,2907 +0,0 @@
-0: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18279b80>
-1: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18272090>
-2: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff1830a2d0>
-3: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff1831ce90>
-4: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18249010>
-5: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18254f50>
-6: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff1826e390>
-7: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff1827fec0>
-8: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff1827dd30>
-9: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18453f50>
-10: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18280140>
-11: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18281dc0>
-12: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18281df0>
-13: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18281e20>
-14: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18281e50>
-15: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18281e80>
-16: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18281eb0>
-17: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18281ee0>
-18: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18281f10>
-19: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18281f40>
-20: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18281f70>
-21: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18281fa0>
-22: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18281fd0>
-23: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282000>
-24: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282030>
-25: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282060>
-26: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282090>
-27: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182820c0>
-28: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282120>
-29: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282150>
-30: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282180>
-31: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182821b0>
-32: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182821e0>
-33: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff184f75f0>
-34: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff184f6630>
-35: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18264b90>
-36: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18248680>
-37: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18251790>
-38: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff184f9fd0>
-39: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182452e0>
-40: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18256660>
-41: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182706b0>
-42: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff1825fbf0>
-43: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff1825c0e0>
-44: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff1825f8c0>
-45: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18235d60>
-46: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff1827ffe0>
-47: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff1823fa40>
-48: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18323890>
-49: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18320f80>
-50: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff183074d0>
-51: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282210>
-52: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282240>
-53: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282270>
-54: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182822a0>
-55: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282300>
-56: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282330>
-57: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282360>
-58: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282390>
-59: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182823c0>
-60: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182823f0>
-61: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282420>
-62: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282450>
-63: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282480>
-64: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182824b0>
-65: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182824e0>
-66: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282510>
-67: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282540>
-68: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282570>
-69: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182825d0>
-70: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282600>
-71: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282630>
-72: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282660>
-73: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282690>
-74: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182826c0>
-75: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182826f0>
-76: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282720>
-77: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282750>
-78: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282780>
-79: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182827b0>
-80: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182827e0>
-81: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282810>
-82: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282840>
-83: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282870>
-84: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182828a0>
-85: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182828d0>
-86: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282900>
-87: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282930>
-88: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282960>
-89: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282990>
-90: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182829c0>
-91: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182829f0>
-92: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282a20>
-93: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282a50>
-94: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282a80>
-95: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282ab0>
-96: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282ae0>
-97: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282b10>
-98: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282b40>
-99: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282b70>
-100: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282ba0>
-101: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282bd0>
-102: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282c00>
-103: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282c30>
-104: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282c60>
-105: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282c90>
-106: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282cc0>
-107: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282cf0>
-108: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282d20>
-109: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282d50>
-110: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282d80>
-111: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282db0>
-112: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282de0>
-113: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282e10>
-114: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282e40>
-115: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282e70>
-116: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282ea0>
-117: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282ed0>
-118: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282f00>
-119: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282f30>
-120: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282f60>
-121: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282f90>
-122: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282fc0>
-123: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18282ff0>
-124: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283020>
-125: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283050>
-126: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283080>
-127: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182830b0>
-128: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182830e0>
-129: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283110>
-130: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283140>
-131: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283170>
-132: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182831a0>
-133: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182831d0>
-134: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283200>
-135: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283230>
-136: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283260>
-137: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182832c0>
-138: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182832f0>
-139: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283320>
-140: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283380>
-141: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182833b0>
-142: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182833e0>
-143: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283440>
-144: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283470>
-145: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182834a0>
-146: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283500>
-147: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283530>
-148: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283560>
-149: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283590>
-150: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182835c0>
-151: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182835f0>
-152: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283620>
-153: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283680>
-154: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182836b0>
-155: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182836e0>
-156: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283710>
-157: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283740>
-158: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283770>
-159: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182837a0>
-160: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182837d0>
-161: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283800>
-162: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283860>
-163: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283890>
-164: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182838c0>
-165: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182838f0>
-166: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283950>
-167: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283980>
-168: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182839b0>
-169: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182839e0>
-170: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283a10>
-171: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283a40>
-172: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283a70>
-173: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283aa0>
-174: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283ad0>
-175: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283b30>
-176: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283b60>
-177: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283b90>
-178: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283bc0>
-179: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283bf0>
-180: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283c20>
-181: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283c50>
-182: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283c80>
-183: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283cb0>
-184: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283ce0>
-185: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283d10>
-186: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283d70>
-187: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283da0>
-188: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283dd0>
-189: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283e00>
-190: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283e30>
-191: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283e90>
-192: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283ec0>
-193: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283ef0>
-194: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283f20>
-195: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283f50>
-196: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283f80>
-197: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18283fb0>
-198: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284050>
-199: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284080>
-200: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182840b0>
-201: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182840e0>
-202: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284110>
-203: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284140>
-204: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284170>
-205: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182841a0>
-206: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182841d0>
-207: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284230>
-208: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284260>
-209: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284290>
-210: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182842c0>
-211: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284320>
-212: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284350>
-213: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284380>
-214: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182843b0>
-215: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182843e0>
-216: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284410>
-217: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284440>
-218: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284470>
-219: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182844a0>
-220: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284500>
-221: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284530>
-222: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284560>
-223: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284590>
-224: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182845c0>
-225: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182845f0>
-226: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284620>
-227: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284650>
-228: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284680>
-229: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182846b0>
-230: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182846e0>
-231: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284710>
-232: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284740>
-233: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284770>
-234: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182847a0>
-235: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182847d0>
-236: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284800>
-237: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284830>
-238: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284860>
-239: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284890>
-240: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182848c0>
-241: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182848f0>
-242: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284920>
-243: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284950>
-244: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284980>
-245: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182849b0>
-246: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182849e0>
-247: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284a10>
-248: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284a40>
-249: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284a70>
-250: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284aa0>
-251: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284ad0>
-252: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284b00>
-253: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284b30>
-254: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284b60>
-255: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284b90>
-256: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284bc0>
-257: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284bf0>
-258: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284c20>
-259: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284c50>
-260: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284c80>
-261: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284cb0>
-262: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284ce0>
-263: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284d10>
-264: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284d40>
-265: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284d70>
-266: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284da0>
-267: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284dd0>
-268: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284e00>
-269: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284e30>
-270: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284e60>
-271: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284e90>
-272: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284ec0>
-273: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284ef0>
-274: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284f20>
-275: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284f50>
-276: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284f80>
-277: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284fb0>
-278: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18284fe0>
-279: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18285010>
-280: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18285040>
-281: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18285070>
-282: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182850a0>
-283: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182850d0>
-284: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18285100>
-285: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18285130>
-286: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18285160>
-287: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18285190>
-288: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182851c0>
-289: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff182851f0>
-290: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18285220>
-291: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18285250>
-292: <buddy.compiler.graph.operation.PlaceholderOp object at 0x7eff18285280>
-293: <buddy.compiler.graph.operation.EmbeddingOp object at 0x7eff184fc5f0>
-294: <buddy.compiler.graph.operation.IotaOp object at 0x7eff1822d6d0>
-295: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18252030>
-296: <buddy.compiler.graph.operation.FullOp object at 0x7eff1827a090>
-297: <buddy.compiler.graph.operation.IotaOp object at 0x7eff1823cf80>
-298: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18262510>
-299: <buddy.compiler.graph.operation.IotaOp object at 0x7eff184fa2d0>
-300: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff183200b0>
-301: <buddy.compiler.graph.operation.SubOp object at 0x7eff184fce30>
-302: <buddy.compiler.graph.operation.GeOp object at 0x7eff18271640>
-303: <buddy.compiler.graph.operation.ScalarTensorOp object at 0x7eff182578f0>
-304: <buddy.compiler.graph.operation.WhereOp object at 0x7eff18251070>
-305: <buddy.compiler.graph.operation.IotaOp object at 0x7eff1823cfe0>
-306: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1825c560>
-307: <buddy.compiler.graph.operation.GreaterThanOp object at 0x7eff1824baa0>
-308: <buddy.compiler.graph.operation.MulOp object at 0x7eff1824ac60>
-309: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1831bcb0>
-310: <buddy.compiler.graph.operation.SliceOp object at 0x7eff182562a0>
-311: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff183223f0>
-312: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff184acf80>
-313: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1822d640>
-314: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18252540>
-315: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18270d10>
-316: <buddy.compiler.graph.operation.ConvertElementTypeOp object at 0x7eff18272360>
-317: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff1826cf20>
-318: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1831a210>
-319: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff1831bce0>
-320: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18270a10>
-321: <buddy.compiler.graph.operation.BatchMatmulOp object at 0x7eff18270e90>
-322: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18248cb0>
-323: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1824a570>
-324: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1825dcd0>
-325: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff1830d670>
-326: <buddy.compiler.graph.operation.CloneOp object at 0x7eff1830e270>
-327: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182521e0>
-328: <buddy.compiler.graph.operation.CloneOp object at 0x7eff1823a060>
-329: <buddy.compiler.graph.operation.CosOp object at 0x7eff18278e60>
-330: <buddy.compiler.graph.operation.SinOp object at 0x7eff182723f0>
-331: <buddy.compiler.graph.operation.MulOp object at 0x7eff18784530>
-332: <buddy.compiler.graph.operation.MulOp object at 0x7eff18278e30>
-333: <buddy.compiler.graph.operation.PowOp object at 0x7eff1827a210> // RMSNorm
-334: <buddy.compiler.graph.operation.MeanOp object at 0x7eff18270cb0>
-335: <buddy.compiler.graph.operation.AddOp object at 0x7eff18251580>
-336: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff18310d10>
-337: <buddy.compiler.graph.operation.MulOp object at 0x7eff18321130>
-338: <buddy.compiler.graph.operation.MulOp object at 0x7eff1823acf0>
-339: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff182515b0>
-340: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18253d10>
-341: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1826cf50>
-342: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18252150>
-343: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1826e870>
-344: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182721e0>
-345: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18270170>
-346: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18272330>
-347: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1827bc20>
-348: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1827b8f0>
-349: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1827a3f0>
-350: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1827bb00>
-351: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1827ba70>
-352: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1827a330>
-353: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1827a1b0>
-354: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1827bd70>
-355: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18278cb0>
-356: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1827bd10>
-357: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1827a2a0>
-358: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18278fe0>
-359: <buddy.compiler.graph.operation.MulOp object at 0x7eff18260e60>
-360: <buddy.compiler.graph.operation.SliceOp object at 0x7eff182609b0>
-361: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18260e90>
-362: <buddy.compiler.graph.operation.NegOp object at 0x7eff18262750>
-363: <buddy.compiler.graph.operation.CatOp object at 0x7eff18319760>
-364: <buddy.compiler.graph.operation.MulOp object at 0x7eff18278ef0>
-365: <buddy.compiler.graph.operation.AddOp object at 0x7eff182545f0>
-366: <buddy.compiler.graph.operation.MulOp object at 0x7eff1824a720>
-367: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1824a5a0>
-368: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1825e8d0>
-369: <buddy.compiler.graph.operation.NegOp object at 0x7eff18263a40>
-370: <buddy.compiler.graph.operation.CatOp object at 0x7eff18260e30>
-371: <buddy.compiler.graph.operation.MulOp object at 0x7eff18261910>
-372: <buddy.compiler.graph.operation.AddOp object at 0x7eff18263ad0>
-373: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18263860>
-374: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff182621e0>
-375: <buddy.compiler.graph.operation.SliceOp object at 0x7eff182621b0>
-376: <buddy.compiler.graph.operation.SliceOp object at 0x7eff182600e0>
-377: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff182611c0>
-378: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18261940>
-379: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1827f980>
-380: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1827dc40>
-381: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1827c590>
-382: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff1827e2d0>
-383: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff18279790>
-384: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18278980>
-385: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18272150>
-386: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18262180>
-387: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1827cf20>
-388: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1827e9f0>
-389: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182852b0>
-390: <buddy.compiler.graph.operation.AddOp object at 0x7eff182852e0>
-391: <buddy.compiler.graph.operation.PowOp object at 0x7eff18285310> //
-392: <buddy.compiler.graph.operation.MeanOp object at 0x7eff18285340>
-393: <buddy.compiler.graph.operation.AddOp object at 0x7eff18285370>
-394: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff182853a0>
-395: <buddy.compiler.graph.operation.MulOp object at 0x7eff18285400>
-396: <buddy.compiler.graph.operation.MulOp object at 0x7eff18285430>
-397: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18285460>
-398: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18285490>
-399: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff182854c0>
-400: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182854f0>
-401: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff18285520>
-402: <buddy.compiler.graph.operation.MulOp object at 0x7eff18271100>
-403: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff183057f0>
-404: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18239880>
-405: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18285550>
-406: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18285580>
-407: <buddy.compiler.graph.operation.MulOp object at 0x7eff182855e0>
-408: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18285610>
-409: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18285640>
-410: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18285670>
-411: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182856a0>
-412: <buddy.compiler.graph.operation.AddOp object at 0x7eff182856d0>
-413: <buddy.compiler.graph.operation.PowOp object at 0x7eff18285700>
-414: <buddy.compiler.graph.operation.MeanOp object at 0x7eff18285730>
-415: <buddy.compiler.graph.operation.AddOp object at 0x7eff18285760>
-416: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff18285790>
-417: <buddy.compiler.graph.operation.MulOp object at 0x7eff182857c0>
-418: <buddy.compiler.graph.operation.MulOp object at 0x7eff182857f0>
-419: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18285820>
-420: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18285850>
-421: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff182858b0>
-422: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182858e0>
-423: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18285910>
-424: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18285940>
-425: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18285970>
-426: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182859a0>
-427: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff182859d0>
-428: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18285a00>
-429: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18285a30>
-430: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18285a60>
-431: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18285a90>
-432: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18285ac0>
-433: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18285af0>
-434: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18285b20>
-435: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18285b50>
-436: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18285bb0>
-437: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18285be0>
-438: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18285c10>
-439: <buddy.compiler.graph.operation.MulOp object at 0x7eff18285c40>
-440: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18285c70>
-441: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18285ca0>
-442: <buddy.compiler.graph.operation.NegOp object at 0x7eff18285cd0>
-443: <buddy.compiler.graph.operation.CatOp object at 0x7eff18285d00>
-444: <buddy.compiler.graph.operation.MulOp object at 0x7eff18285d30>
-445: <buddy.compiler.graph.operation.AddOp object at 0x7eff18285d60>
-446: <buddy.compiler.graph.operation.MulOp object at 0x7eff18285d90>
-447: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18285dc0>
-448: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18285df0>
-449: <buddy.compiler.graph.operation.NegOp object at 0x7eff18285e20>
-450: <buddy.compiler.graph.operation.CatOp object at 0x7eff18285e50>
-451: <buddy.compiler.graph.operation.MulOp object at 0x7eff18285e80>
-452: <buddy.compiler.graph.operation.AddOp object at 0x7eff18285eb0>
-453: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18285f10>
-454: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18285f40>
-455: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18285f70>
-456: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18285fa0>
-457: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff18285fd0>
-458: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286000>
-459: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286030>
-460: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286060>
-461: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286090>
-462: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff182860c0>
-463: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff18286120>
-464: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff182860f0>
-465: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286150>
-466: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18286180>
-467: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182861b0>
-468: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff182861e0>
-469: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286210>
-470: <buddy.compiler.graph.operation.AddOp object at 0x7eff18286240>
-471: <buddy.compiler.graph.operation.PowOp object at 0x7eff18286270>
-472: <buddy.compiler.graph.operation.MeanOp object at 0x7eff182862a0>
-473: <buddy.compiler.graph.operation.AddOp object at 0x7eff182862d0>
-474: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff18286330>
-475: <buddy.compiler.graph.operation.MulOp object at 0x7eff18286360>
-476: <buddy.compiler.graph.operation.MulOp object at 0x7eff18286390>
-477: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff182863c0>
-478: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182863f0>
-479: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18286420>
-480: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286450>
-481: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff18286480>
-482: <buddy.compiler.graph.operation.MulOp object at 0x7eff182864b0>
-483: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18286510>
-484: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286540>
-485: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18286570>
-486: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182865a0>
-487: <buddy.compiler.graph.operation.MulOp object at 0x7eff182865d0>
-488: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18286600>
-489: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286630>
-490: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18286660>
-491: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286690>
-492: <buddy.compiler.graph.operation.AddOp object at 0x7eff182866c0>
-493: <buddy.compiler.graph.operation.PowOp object at 0x7eff182866f0>
-494: <buddy.compiler.graph.operation.MeanOp object at 0x7eff18286720>
-495: <buddy.compiler.graph.operation.AddOp object at 0x7eff18286750>
-496: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff18286780>
-497: <buddy.compiler.graph.operation.MulOp object at 0x7eff182867e0>
-498: <buddy.compiler.graph.operation.MulOp object at 0x7eff18286810>
-499: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18286840>
-500: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286870>
-501: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff182868a0>
-502: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182868d0>
-503: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18286900>
-504: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286930>
-505: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18286960>
-506: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286990>
-507: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff182869c0>
-508: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182869f0>
-509: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18286a20>
-510: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286a50>
-511: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286a80>
-512: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18286ab0>
-513: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286ae0>
-514: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18286b10>
-515: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18286b40>
-516: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18286b70>
-517: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18286ba0>
-518: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18286bd0>
-519: <buddy.compiler.graph.operation.MulOp object at 0x7eff18286c00>
-520: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286c30>
-521: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286c60>
-522: <buddy.compiler.graph.operation.NegOp object at 0x7eff18286c90>
-523: <buddy.compiler.graph.operation.CatOp object at 0x7eff18286cc0>
-524: <buddy.compiler.graph.operation.MulOp object at 0x7eff18286cf0>
-525: <buddy.compiler.graph.operation.AddOp object at 0x7eff18286d20>
-526: <buddy.compiler.graph.operation.MulOp object at 0x7eff18286d50>
-527: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286d80>
-528: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286db0>
-529: <buddy.compiler.graph.operation.NegOp object at 0x7eff18286de0>
-530: <buddy.compiler.graph.operation.CatOp object at 0x7eff18286e10>
-531: <buddy.compiler.graph.operation.MulOp object at 0x7eff18286e40>
-532: <buddy.compiler.graph.operation.AddOp object at 0x7eff18286e70>
-533: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18286ea0>
-534: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18286ed0>
-535: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286f00>
-536: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286f30>
-537: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff18286f60>
-538: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286f90>
-539: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286fc0>
-540: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18286ff0>
-541: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18287020>
-542: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff18287050>
-543: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff182870b0>
-544: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18287080>
-545: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182870e0>
-546: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18287110>
-547: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287140>
-548: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18287170>
-549: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182871a0>
-550: <buddy.compiler.graph.operation.AddOp object at 0x7eff182871d0>
-551: <buddy.compiler.graph.operation.PowOp object at 0x7eff18287200>
-552: <buddy.compiler.graph.operation.MeanOp object at 0x7eff18287230>
-553: <buddy.compiler.graph.operation.AddOp object at 0x7eff18287260>
-554: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff18287290>
-555: <buddy.compiler.graph.operation.MulOp object at 0x7eff182872c0>
-556: <buddy.compiler.graph.operation.MulOp object at 0x7eff182872f0>
-557: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18287320>
-558: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287350>
-559: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18287380>
-560: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182873b0>
-561: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff182873e0>
-562: <buddy.compiler.graph.operation.MulOp object at 0x7eff18287410>
-563: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18287440>
-564: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287470>
-565: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff182874d0>
-566: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287500>
-567: <buddy.compiler.graph.operation.MulOp object at 0x7eff18287530>
-568: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18287560>
-569: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287590>
-570: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff182875f0>
-571: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287620>
-572: <buddy.compiler.graph.operation.AddOp object at 0x7eff18287650>
-573: <buddy.compiler.graph.operation.PowOp object at 0x7eff18287680>
-574: <buddy.compiler.graph.operation.MeanOp object at 0x7eff182876b0>
-575: <buddy.compiler.graph.operation.AddOp object at 0x7eff18287710>
-576: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff18287740>
-577: <buddy.compiler.graph.operation.MulOp object at 0x7eff18287770>
-578: <buddy.compiler.graph.operation.MulOp object at 0x7eff182877d0>
-579: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18287800>
-580: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287830>
-581: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18287860>
-582: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287890>
-583: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff182878c0>
-584: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182878f0>
-585: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18287950>
-586: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287980>
-587: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff182879b0>
-588: <buddy.compiler.graph.operation.ViewOp object at 0x7eff182879e0>
-589: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18287a10>
-590: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287a40>
-591: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287a70>
-592: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18287aa0>
-593: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287ad0>
-594: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18287b30>
-595: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18287b60>
-596: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff18287b90>
-597: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18287bc0>
-598: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18287c20>
-599: <buddy.compiler.graph.operation.MulOp object at 0x7eff18287c50>
-600: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18287c80>
-601: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18287cb0>
-602: <buddy.compiler.graph.operation.NegOp object at 0x7eff18287ce0>
-603: <buddy.compiler.graph.operation.CatOp object at 0x7eff18287d10>
-604: <buddy.compiler.graph.operation.MulOp object at 0x7eff18287d40>
-605: <buddy.compiler.graph.operation.AddOp object at 0x7eff18287d70>
-606: <buddy.compiler.graph.operation.MulOp object at 0x7eff18287da0>
-607: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18287e00>
-608: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18287e30>
-609: <buddy.compiler.graph.operation.NegOp object at 0x7eff18287e60>
-610: <buddy.compiler.graph.operation.CatOp object at 0x7eff18287e90>
-611: <buddy.compiler.graph.operation.MulOp object at 0x7eff18287ec0>
-612: <buddy.compiler.graph.operation.AddOp object at 0x7eff18287ef0>
-613: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18287f20>
-614: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff18287f50>
-615: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18287f80>
-616: <buddy.compiler.graph.operation.SliceOp object at 0x7eff18287fb0>
-617: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff18287fe0>
-618: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828c080>
-619: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828c0b0>
-620: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828c0e0>
-621: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828c110>
-622: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff1828c140>
-623: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff1828c1d0>
-624: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828c1a0>
-625: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828c200>
-626: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828c230>
-627: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828c260>
-628: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828c290>
-629: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828c2c0>
-630: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828c320>
-631: <buddy.compiler.graph.operation.PowOp object at 0x7eff1828c350>
-632: <buddy.compiler.graph.operation.MeanOp object at 0x7eff1828c380>
-633: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828c3b0>
-634: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1828c3e0>
-635: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828c410>
-636: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828c440>
-637: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828c470>
-638: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828c4a0>
-639: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828c500>
-640: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828c530>
-641: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff1828c560>
-642: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828c590>
-643: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828c5f0>
-644: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828c620>
-645: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828c650>
-646: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828c680>
-647: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828c6b0>
-648: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828c6e0>
-649: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828c710>
-650: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828c740>
-651: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828c770>
-652: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828c7d0>
-653: <buddy.compiler.graph.operation.PowOp object at 0x7eff1828c800>
-654: <buddy.compiler.graph.operation.MeanOp object at 0x7eff1828c830>
-655: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828c860>
-656: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1828c890>
-657: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828c8c0>
-658: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828c8f0>
-659: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828c920>
-660: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828c950>
-661: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828c980>
-662: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828c9b0>
-663: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828c9e0>
-664: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828ca10>
-665: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828ca40>
-666: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828ca70>
-667: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828caa0>
-668: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828cad0>
-669: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828cb00>
-670: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828cb30>
-671: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828cb60>
-672: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828cb90>
-673: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828cbc0>
-674: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828cbf0>
-675: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828cc20>
-676: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828cc50>
-677: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828cc80>
-678: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828ccb0>
-679: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828cce0>
-680: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828cd10>
-681: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828cd40>
-682: <buddy.compiler.graph.operation.NegOp object at 0x7eff1828cd70>
-683: <buddy.compiler.graph.operation.CatOp object at 0x7eff1828cda0>
-684: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828cdd0>
-685: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828ce00>
-686: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828ce30>
-687: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828ce60>
-688: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828ce90>
-689: <buddy.compiler.graph.operation.NegOp object at 0x7eff1828cec0>
-690: <buddy.compiler.graph.operation.CatOp object at 0x7eff1828cef0>
-691: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828cf20>
-692: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828cf50>
-693: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828cf80>
-694: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828cfb0>
-695: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828cfe0>
-696: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828d010>
-697: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff1828d040>
-698: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828d070>
-699: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828d0a0>
-700: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828d0d0>
-701: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828d100>
-702: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff1828d130>
-703: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff1828d190>
-704: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828d160>
-705: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d1c0>
-706: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828d1f0>
-707: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d220>
-708: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828d250>
-709: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d280>
-710: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828d2b0>
-711: <buddy.compiler.graph.operation.PowOp object at 0x7eff1828d2e0>
-712: <buddy.compiler.graph.operation.MeanOp object at 0x7eff1828d310>
-713: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828d340>
-714: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1828d370>
-715: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828d3a0>
-716: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828d3d0>
-717: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828d400>
-718: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d430>
-719: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828d460>
-720: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d490>
-721: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff1828d4c0>
-722: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828d4f0>
-723: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828d520>
-724: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d550>
-725: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828d580>
-726: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d5b0>
-727: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828d5e0>
-728: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828d610>
-729: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d640>
-730: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828d670>
-731: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d6a0>
-732: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828d6d0>
-733: <buddy.compiler.graph.operation.PowOp object at 0x7eff1828d730>
-734: <buddy.compiler.graph.operation.MeanOp object at 0x7eff1828d760>
-735: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828d790>
-736: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1828d7c0>
-737: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828d7f0>
-738: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828d820>
-739: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828d850>
-740: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d880>
-741: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828d8b0>
-742: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d910>
-743: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828d940>
-744: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d970>
-745: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828d9a0>
-746: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828d9d0>
-747: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828da00>
-748: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828da30>
-749: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828da60>
-750: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828da90>
-751: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828dac0>
-752: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828daf0>
-753: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828db20>
-754: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828db50>
-755: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828db80>
-756: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828dbe0>
-757: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828dc10>
-758: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828dc40>
-759: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828dc70>
-760: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828dca0>
-761: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828dcd0>
-762: <buddy.compiler.graph.operation.NegOp object at 0x7eff1828dd00>
-763: <buddy.compiler.graph.operation.CatOp object at 0x7eff1828dd30>
-764: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828dd60>
-765: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828dd90>
-766: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828ddc0>
-767: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828ddf0>
-768: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828de20>
-769: <buddy.compiler.graph.operation.NegOp object at 0x7eff1828de50>
-770: <buddy.compiler.graph.operation.CatOp object at 0x7eff1828de80>
-771: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828dee0>
-772: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828df10>
-773: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828df40>
-774: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828df70>
-775: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828dfa0>
-776: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828dfd0>
-777: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff1828e000>
-778: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828e030>
-779: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828e060>
-780: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828e090>
-781: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828e0c0>
-782: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff1828e0f0>
-783: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff1828e150>
-784: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828e120>
-785: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e180>
-786: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828e1b0>
-787: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e1e0>
-788: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828e240>
-789: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e270>
-790: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828e2a0>
-791: <buddy.compiler.graph.operation.PowOp object at 0x7eff1828e2d0>
-792: <buddy.compiler.graph.operation.MeanOp object at 0x7eff1828e300>
-793: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828e330>
-794: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1828e360>
-795: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828e390>
-796: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828e3c0>
-797: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828e3f0>
-798: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e420>
-799: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828e450>
-800: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e480>
-801: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff1828e4b0>
-802: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828e4e0>
-803: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828e510>
-804: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e540>
-805: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828e570>
-806: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e5a0>
-807: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828e5d0>
-808: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828e600>
-809: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e630>
-810: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828e690>
-811: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e6c0>
-812: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828e6f0>
-813: <buddy.compiler.graph.operation.PowOp object at 0x7eff1828e720>
-814: <buddy.compiler.graph.operation.MeanOp object at 0x7eff1828e750>
-815: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828e780>
-816: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1828e7b0>
-817: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828e7e0>
-818: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828e810>
-819: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828e870>
-820: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e8a0>
-821: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828e8d0>
-822: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e900>
-823: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828e930>
-824: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e960>
-825: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828e990>
-826: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828e9c0>
-827: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828e9f0>
-828: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828ea20>
-829: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828ea50>
-830: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828ea80>
-831: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828eab0>
-832: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828eae0>
-833: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828eb40>
-834: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828eb70>
-835: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828eba0>
-836: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828ebd0>
-837: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828ec00>
-838: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828ec30>
-839: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828ec60>
-840: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828ec90>
-841: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828ecc0>
-842: <buddy.compiler.graph.operation.NegOp object at 0x7eff1828ecf0>
-843: <buddy.compiler.graph.operation.CatOp object at 0x7eff1828ed20>
-844: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828ed50>
-845: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828ed80>
-846: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828edb0>
-847: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828ede0>
-848: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828ee10>
-849: <buddy.compiler.graph.operation.NegOp object at 0x7eff1828ee40>
-850: <buddy.compiler.graph.operation.CatOp object at 0x7eff1828ee70>
-851: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828eea0>
-852: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828eed0>
-853: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828ef00>
-854: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828ef30>
-855: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828ef60>
-856: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828ef90>
-857: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff1828efc0>
-858: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828eff0>
-859: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828f020>
-860: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828f050>
-861: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828f080>
-862: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff1828f0b0>
-863: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff1828f110>
-864: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828f0e0>
-865: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f140>
-866: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828f170>
-867: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f1a0>
-868: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828f1d0>
-869: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f200>
-870: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828f230>
-871: <buddy.compiler.graph.operation.PowOp object at 0x7eff1828f260>
-872: <buddy.compiler.graph.operation.MeanOp object at 0x7eff1828f290>
-873: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828f2c0>
-874: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1828f2f0>
-875: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828f320>
-876: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828f350>
-877: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828f380>
-878: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f3b0>
-879: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828f3e0>
-880: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f410>
-881: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff1828f440>
-882: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828f470>
-883: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828f4a0>
-884: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f4d0>
-885: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828f500>
-886: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f530>
-887: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828f560>
-888: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828f590>
-889: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f5c0>
-890: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828f5f0>
-891: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f620>
-892: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828f650>
-893: <buddy.compiler.graph.operation.PowOp object at 0x7eff1828f680>
-894: <buddy.compiler.graph.operation.MeanOp object at 0x7eff1828f6b0>
-895: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828f6e0>
-896: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1828f710>
-897: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828f740>
-898: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828f770>
-899: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828f7a0>
-900: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f7d0>
-901: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828f800>
-902: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f830>
-903: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828f860>
-904: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f890>
-905: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828f8c0>
-906: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f8f0>
-907: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828f920>
-908: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f950>
-909: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1828f980>
-910: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f9b0>
-911: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828f9e0>
-912: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828fa10>
-913: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828fa40>
-914: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828fa70>
-915: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1828faa0>
-916: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828fad0>
-917: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828fb00>
-918: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828fb30>
-919: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828fb60>
-920: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828fb90>
-921: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828fbc0>
-922: <buddy.compiler.graph.operation.NegOp object at 0x7eff1828fbf0>
-923: <buddy.compiler.graph.operation.CatOp object at 0x7eff1828fc20>
-924: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828fc50>
-925: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828fc80>
-926: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828fcb0>
-927: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828fce0>
-928: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828fd10>
-929: <buddy.compiler.graph.operation.NegOp object at 0x7eff1828fd40>
-930: <buddy.compiler.graph.operation.CatOp object at 0x7eff1828fd70>
-931: <buddy.compiler.graph.operation.MulOp object at 0x7eff1828fda0>
-932: <buddy.compiler.graph.operation.AddOp object at 0x7eff1828fdd0>
-933: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828fe00>
-934: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1828fe30>
-935: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828fe60>
-936: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828fe90>
-937: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff1828fec0>
-938: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828fef0>
-939: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828ff20>
-940: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828ff50>
-941: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1828ff80>
-942: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff1828ffb0>
-943: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff17288050>
-944: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1828ffe0>
-945: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288080>
-946: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172880b0>
-947: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172880e0>
-948: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17288110>
-949: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288140>
-950: <buddy.compiler.graph.operation.AddOp object at 0x7eff17288170>
-951: <buddy.compiler.graph.operation.PowOp object at 0x7eff172881a0>
-952: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172881d0>
-953: <buddy.compiler.graph.operation.AddOp object at 0x7eff17288200>
-954: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17288230>
-955: <buddy.compiler.graph.operation.MulOp object at 0x7eff17288260>
-956: <buddy.compiler.graph.operation.MulOp object at 0x7eff17288290>
-957: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172882c0>
-958: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172882f0>
-959: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17288320>
-960: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288350>
-961: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17288380>
-962: <buddy.compiler.graph.operation.MulOp object at 0x7eff172883b0>
-963: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172883e0>
-964: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288410>
-965: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17288440>
-966: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288470>
-967: <buddy.compiler.graph.operation.MulOp object at 0x7eff172884a0>
-968: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172884d0>
-969: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288500>
-970: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17288530>
-971: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288560>
-972: <buddy.compiler.graph.operation.AddOp object at 0x7eff17288590>
-973: <buddy.compiler.graph.operation.PowOp object at 0x7eff172885c0>
-974: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172885f0>
-975: <buddy.compiler.graph.operation.AddOp object at 0x7eff17288620>
-976: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17288650>
-977: <buddy.compiler.graph.operation.MulOp object at 0x7eff17288680>
-978: <buddy.compiler.graph.operation.MulOp object at 0x7eff172886b0>
-979: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172886e0>
-980: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288710>
-981: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17288740>
-982: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288770>
-983: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172887a0>
-984: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172887d0>
-985: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17288800>
-986: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288830>
-987: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17288860>
-988: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288890>
-989: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172888c0>
-990: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172888f0>
-991: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288920>
-992: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17288950>
-993: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288980>
-994: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172889b0>
-995: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172889e0>
-996: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17288a10>
-997: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17288a40>
-998: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17288a70>
-999: <buddy.compiler.graph.operation.MulOp object at 0x7eff17288aa0>
-1000: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17288ad0>
-1001: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17288b00>
-1002: <buddy.compiler.graph.operation.NegOp object at 0x7eff17288b30>
-1003: <buddy.compiler.graph.operation.CatOp object at 0x7eff17288b60>
-1004: <buddy.compiler.graph.operation.MulOp object at 0x7eff17288b90>
-1005: <buddy.compiler.graph.operation.AddOp object at 0x7eff17288bc0>
-1006: <buddy.compiler.graph.operation.MulOp object at 0x7eff17288bf0>
-1007: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17288c20>
-1008: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17288c50>
-1009: <buddy.compiler.graph.operation.NegOp object at 0x7eff17288c80>
-1010: <buddy.compiler.graph.operation.CatOp object at 0x7eff17288cb0>
-1011: <buddy.compiler.graph.operation.MulOp object at 0x7eff17288ce0>
-1012: <buddy.compiler.graph.operation.AddOp object at 0x7eff17288d10>
-1013: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17288d40>
-1014: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17288d70>
-1015: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17288da0>
-1016: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17288dd0>
-1017: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff17288e00>
-1018: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17288e30>
-1019: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17288e60>
-1020: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17288e90>
-1021: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17288ec0>
-1022: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff17288ef0>
-1023: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff17288f50>
-1024: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17288f20>
-1025: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288f80>
-1026: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17288fb0>
-1027: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17288fe0>
-1028: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17289010>
-1029: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289040>
-1030: <buddy.compiler.graph.operation.AddOp object at 0x7eff17289070>
-1031: <buddy.compiler.graph.operation.PowOp object at 0x7eff172890a0>
-1032: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172890d0>
-1033: <buddy.compiler.graph.operation.AddOp object at 0x7eff17289100>
-1034: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17289130>
-1035: <buddy.compiler.graph.operation.MulOp object at 0x7eff17289160>
-1036: <buddy.compiler.graph.operation.MulOp object at 0x7eff17289190>
-1037: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172891c0>
-1038: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172891f0>
-1039: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17289220>
-1040: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289250>
-1041: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17289280>
-1042: <buddy.compiler.graph.operation.MulOp object at 0x7eff172892b0>
-1043: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172892e0>
-1044: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289310>
-1045: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17289340>
-1046: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289370>
-1047: <buddy.compiler.graph.operation.MulOp object at 0x7eff172893a0>
-1048: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172893d0>
-1049: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289400>
-1050: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17289430>
-1051: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289460>
-1052: <buddy.compiler.graph.operation.AddOp object at 0x7eff17289490>
-1053: <buddy.compiler.graph.operation.PowOp object at 0x7eff172894c0>
-1054: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172894f0>
-1055: <buddy.compiler.graph.operation.AddOp object at 0x7eff17289520>
-1056: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17289550>
-1057: <buddy.compiler.graph.operation.MulOp object at 0x7eff17289580>
-1058: <buddy.compiler.graph.operation.MulOp object at 0x7eff172895b0>
-1059: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172895e0>
-1060: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289610>
-1061: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17289640>
-1062: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289670>
-1063: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172896a0>
-1064: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172896d0>
-1065: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17289700>
-1066: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289730>
-1067: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17289760>
-1068: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289790>
-1069: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172897c0>
-1070: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172897f0>
-1071: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289820>
-1072: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17289850>
-1073: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289880>
-1074: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172898b0>
-1075: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172898e0>
-1076: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17289910>
-1077: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17289940>
-1078: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17289970>
-1079: <buddy.compiler.graph.operation.MulOp object at 0x7eff172899a0>
-1080: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172899d0>
-1081: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17289a00>
-1082: <buddy.compiler.graph.operation.NegOp object at 0x7eff17289a30>
-1083: <buddy.compiler.graph.operation.CatOp object at 0x7eff17289a60>
-1084: <buddy.compiler.graph.operation.MulOp object at 0x7eff17289a90>
-1085: <buddy.compiler.graph.operation.AddOp object at 0x7eff17289ac0>
-1086: <buddy.compiler.graph.operation.MulOp object at 0x7eff17289af0>
-1087: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17289b20>
-1088: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17289b50>
-1089: <buddy.compiler.graph.operation.NegOp object at 0x7eff17289b80>
-1090: <buddy.compiler.graph.operation.CatOp object at 0x7eff17289bb0>
-1091: <buddy.compiler.graph.operation.MulOp object at 0x7eff17289be0>
-1092: <buddy.compiler.graph.operation.AddOp object at 0x7eff17289c10>
-1093: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17289c40>
-1094: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17289c70>
-1095: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17289ca0>
-1096: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17289cd0>
-1097: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff17289d00>
-1098: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17289d30>
-1099: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17289d60>
-1100: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17289d90>
-1101: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17289dc0>
-1102: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff17289df0>
-1103: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff17289e50>
-1104: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17289e20>
-1105: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289e80>
-1106: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17289eb0>
-1107: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289ee0>
-1108: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17289f10>
-1109: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17289f40>
-1110: <buddy.compiler.graph.operation.AddOp object at 0x7eff17289f70>
-1111: <buddy.compiler.graph.operation.PowOp object at 0x7eff17289fa0>
-1112: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17289fd0>
-1113: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728a000>
-1114: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1728a030>
-1115: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728a060>
-1116: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728a090>
-1117: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728a0c0>
-1118: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a0f0>
-1119: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728a120>
-1120: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a150>
-1121: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff1728a180>
-1122: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728a1b0>
-1123: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728a1e0>
-1124: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a210>
-1125: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728a240>
-1126: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a270>
-1127: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728a2a0>
-1128: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728a2d0>
-1129: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a300>
-1130: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728a330>
-1131: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a360>
-1132: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728a390>
-1133: <buddy.compiler.graph.operation.PowOp object at 0x7eff1728a3c0>
-1134: <buddy.compiler.graph.operation.MeanOp object at 0x7eff1728a3f0>
-1135: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728a420>
-1136: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1728a450>
-1137: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728a480>
-1138: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728a4b0>
-1139: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728a4e0>
-1140: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a510>
-1141: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728a540>
-1142: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a570>
-1143: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728a5a0>
-1144: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a5d0>
-1145: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728a600>
-1146: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a630>
-1147: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728a660>
-1148: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a690>
-1149: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728a6c0>
-1150: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a6f0>
-1151: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a720>
-1152: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728a750>
-1153: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a780>
-1154: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728a7b0>
-1155: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728a7e0>
-1156: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728a810>
-1157: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1728a840>
-1158: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1728a870>
-1159: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728a8a0>
-1160: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728a8d0>
-1161: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728a900>
-1162: <buddy.compiler.graph.operation.NegOp object at 0x7eff1728a930>
-1163: <buddy.compiler.graph.operation.CatOp object at 0x7eff1728a960>
-1164: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728a990>
-1165: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728a9c0>
-1166: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728a9f0>
-1167: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728aa20>
-1168: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728aa50>
-1169: <buddy.compiler.graph.operation.NegOp object at 0x7eff1728aa80>
-1170: <buddy.compiler.graph.operation.CatOp object at 0x7eff1728aab0>
-1171: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728aae0>
-1172: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728ab10>
-1173: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1728ab40>
-1174: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1728ab70>
-1175: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728aba0>
-1176: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728abd0>
-1177: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff1728ac00>
-1178: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728ac30>
-1179: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728ac60>
-1180: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728ac90>
-1181: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728acc0>
-1182: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff1728acf0>
-1183: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff1728ad50>
-1184: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728ad20>
-1185: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728ad80>
-1186: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728adb0>
-1187: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728ade0>
-1188: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728ae10>
-1189: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728ae40>
-1190: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728ae70>
-1191: <buddy.compiler.graph.operation.PowOp object at 0x7eff1728aea0>
-1192: <buddy.compiler.graph.operation.MeanOp object at 0x7eff1728aed0>
-1193: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728af00>
-1194: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1728af30>
-1195: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728af60>
-1196: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728af90>
-1197: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728afc0>
-1198: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728aff0>
-1199: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728b020>
-1200: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b050>
-1201: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff1728b080>
-1202: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728b0b0>
-1203: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728b0e0>
-1204: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b110>
-1205: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728b140>
-1206: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b170>
-1207: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728b1a0>
-1208: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728b1d0>
-1209: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b200>
-1210: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728b230>
-1211: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b260>
-1212: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728b290>
-1213: <buddy.compiler.graph.operation.PowOp object at 0x7eff1728b2c0>
-1214: <buddy.compiler.graph.operation.MeanOp object at 0x7eff1728b2f0>
-1215: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728b320>
-1216: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1728b350>
-1217: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728b380>
-1218: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728b3b0>
-1219: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728b3e0>
-1220: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b410>
-1221: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728b440>
-1222: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b470>
-1223: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728b4a0>
-1224: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b4d0>
-1225: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728b500>
-1226: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b530>
-1227: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728b560>
-1228: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b590>
-1229: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728b5c0>
-1230: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b5f0>
-1231: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b620>
-1232: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728b650>
-1233: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b680>
-1234: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728b6b0>
-1235: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728b6e0>
-1236: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728b710>
-1237: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1728b740>
-1238: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1728b770>
-1239: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728b7a0>
-1240: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728b7d0>
-1241: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728b800>
-1242: <buddy.compiler.graph.operation.NegOp object at 0x7eff1728b830>
-1243: <buddy.compiler.graph.operation.CatOp object at 0x7eff1728b860>
-1244: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728b890>
-1245: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728b8c0>
-1246: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728b8f0>
-1247: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728b920>
-1248: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728b950>
-1249: <buddy.compiler.graph.operation.NegOp object at 0x7eff1728b980>
-1250: <buddy.compiler.graph.operation.CatOp object at 0x7eff1728b9b0>
-1251: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728b9e0>
-1252: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728ba10>
-1253: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1728ba40>
-1254: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff1728ba70>
-1255: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728baa0>
-1256: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728bad0>
-1257: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff1728bb00>
-1258: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728bb30>
-1259: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728bb60>
-1260: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728bb90>
-1261: <buddy.compiler.graph.operation.SliceOp object at 0x7eff1728bbc0>
-1262: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff1728bbf0>
-1263: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff1728bc50>
-1264: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728bc20>
-1265: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728bc80>
-1266: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728bcb0>
-1267: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728bce0>
-1268: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728bd10>
-1269: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728bd40>
-1270: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728bd70>
-1271: <buddy.compiler.graph.operation.PowOp object at 0x7eff1728bda0>
-1272: <buddy.compiler.graph.operation.MeanOp object at 0x7eff1728bdd0>
-1273: <buddy.compiler.graph.operation.AddOp object at 0x7eff1728be00>
-1274: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1728be30>
-1275: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728be60>
-1276: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728be90>
-1277: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728bec0>
-1278: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728bef0>
-1279: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff1728bf20>
-1280: <buddy.compiler.graph.operation.ViewOp object at 0x7eff1728bf50>
-1281: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff1728bf80>
-1282: <buddy.compiler.graph.operation.MulOp object at 0x7eff1728bfb0>
-1283: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1728bfe0>
-1284: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc050>
-1285: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cc080>
-1286: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc0b0>
-1287: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cc0e0>
-1288: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cc110>
-1289: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc140>
-1290: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cc170>
-1291: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc1a0>
-1292: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cc1d0>
-1293: <buddy.compiler.graph.operation.PowOp object at 0x7eff172cc200>
-1294: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172cc230>
-1295: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cc260>
-1296: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff172cc290>
-1297: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cc2c0>
-1298: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cc2f0>
-1299: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cc320>
-1300: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc350>
-1301: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cc380>
-1302: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc3b0>
-1303: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cc3e0>
-1304: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc410>
-1305: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cc440>
-1306: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc470>
-1307: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cc4a0>
-1308: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc4d0>
-1309: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cc500>
-1310: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc530>
-1311: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc560>
-1312: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cc590>
-1313: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc5c0>
-1314: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cc5f0>
-1315: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cc620>
-1316: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cc650>
-1317: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172cc680>
-1318: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172cc6b0>
-1319: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cc6e0>
-1320: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cc710>
-1321: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cc740>
-1322: <buddy.compiler.graph.operation.NegOp object at 0x7eff172cc770>
-1323: <buddy.compiler.graph.operation.CatOp object at 0x7eff172cc7a0>
-1324: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cc7d0>
-1325: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cc800>
-1326: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cc830>
-1327: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cc860>
-1328: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cc890>
-1329: <buddy.compiler.graph.operation.NegOp object at 0x7eff172cc8c0>
-1330: <buddy.compiler.graph.operation.CatOp object at 0x7eff172cc8f0>
-1331: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cc920>
-1332: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cc950>
-1333: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172cc980>
-1334: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172cc9b0>
-1335: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cc9e0>
-1336: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cca10>
-1337: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff172cca40>
-1338: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cca70>
-1339: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ccaa0>
-1340: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ccad0>
-1341: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ccb00>
-1342: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff172ccb30>
-1343: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff172ccb90>
-1344: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172ccb60>
-1345: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ccbc0>
-1346: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172ccbf0>
-1347: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ccc20>
-1348: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172ccc50>
-1349: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ccc80>
-1350: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cccb0>
-1351: <buddy.compiler.graph.operation.PowOp object at 0x7eff172ccce0>
-1352: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172ccd10>
-1353: <buddy.compiler.graph.operation.AddOp object at 0x7eff172ccd40>
-1354: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff172ccd70>
-1355: <buddy.compiler.graph.operation.MulOp object at 0x7eff172ccda0>
-1356: <buddy.compiler.graph.operation.MulOp object at 0x7eff172ccdd0>
-1357: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cce00>
-1358: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cce30>
-1359: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cce60>
-1360: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cce90>
-1361: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff172ccec0>
-1362: <buddy.compiler.graph.operation.MulOp object at 0x7eff172ccef0>
-1363: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172ccf20>
-1364: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ccf50>
-1365: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172ccf80>
-1366: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ccfb0>
-1367: <buddy.compiler.graph.operation.MulOp object at 0x7eff172ccfe0>
-1368: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cd010>
-1369: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cd040>
-1370: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cd070>
-1371: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cd0a0>
-1372: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cd0d0>
-1373: <buddy.compiler.graph.operation.PowOp object at 0x7eff172cd100>
-1374: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172cd130>
-1375: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cd160>
-1376: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff172cd190>
-1377: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cd1c0>
-1378: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cd1f0>
-1379: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cd220>
-1380: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cd250>
-1381: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cd280>
-1382: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cd2b0>
-1383: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cd2e0>
-1384: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cd310>
-1385: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cd340>
-1386: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cd370>
-1387: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cd3a0>
-1388: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cd3d0>
-1389: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cd400>
-1390: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cd430>
-1391: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cd460>
-1392: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cd490>
-1393: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cd4c0>
-1394: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cd4f0>
-1395: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cd520>
-1396: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cd550>
-1397: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172cd580>
-1398: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172cd5b0>
-1399: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cd5e0>
-1400: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cd610>
-1401: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cd640>
-1402: <buddy.compiler.graph.operation.NegOp object at 0x7eff172cd670>
-1403: <buddy.compiler.graph.operation.CatOp object at 0x7eff172cd6a0>
-1404: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cd6d0>
-1405: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cd700>
-1406: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cd730>
-1407: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cd760>
-1408: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cd790>
-1409: <buddy.compiler.graph.operation.NegOp object at 0x7eff172cd7c0>
-1410: <buddy.compiler.graph.operation.CatOp object at 0x7eff172cd7f0>
-1411: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cd820>
-1412: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cd850>
-1413: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172cd880>
-1414: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172cd8b0>
-1415: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cd8e0>
-1416: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cd910>
-1417: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff172cd940>
-1418: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cd970>
-1419: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cd9a0>
-1420: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cd9d0>
-1421: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cda00>
-1422: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff172cda30>
-1423: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff172cda90>
-1424: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cda60>
-1425: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cdac0>
-1426: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cdaf0>
-1427: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cdb20>
-1428: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cdb50>
-1429: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cdb80>
-1430: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cdbb0>
-1431: <buddy.compiler.graph.operation.PowOp object at 0x7eff172cdbe0>
-1432: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172cdc10>
-1433: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cdc40>
-1434: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff172cdc70>
-1435: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cdca0>
-1436: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cdcd0>
-1437: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cdd00>
-1438: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cdd30>
-1439: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cdd60>
-1440: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cdd90>
-1441: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff172cddc0>
-1442: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cddf0>
-1443: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cde20>
-1444: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cde50>
-1445: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cde80>
-1446: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cdeb0>
-1447: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cdee0>
-1448: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cdf10>
-1449: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cdf40>
-1450: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cdf70>
-1451: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cdfa0>
-1452: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cdfd0>
-1453: <buddy.compiler.graph.operation.PowOp object at 0x7eff172ce000>
-1454: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172ce030>
-1455: <buddy.compiler.graph.operation.AddOp object at 0x7eff172ce060>
-1456: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff172ce090>
-1457: <buddy.compiler.graph.operation.MulOp object at 0x7eff172ce0c0>
-1458: <buddy.compiler.graph.operation.MulOp object at 0x7eff172ce0f0>
-1459: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172ce120>
-1460: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ce150>
-1461: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172ce180>
-1462: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ce1b0>
-1463: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172ce1e0>
-1464: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ce210>
-1465: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172ce240>
-1466: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ce270>
-1467: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172ce2a0>
-1468: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ce2d0>
-1469: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172ce300>
-1470: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ce330>
-1471: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ce360>
-1472: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172ce390>
-1473: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ce3c0>
-1474: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172ce3f0>
-1475: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ce420>
-1476: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172ce450>
-1477: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172ce480>
-1478: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172ce4b0>
-1479: <buddy.compiler.graph.operation.MulOp object at 0x7eff172ce4e0>
-1480: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ce510>
-1481: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ce540>
-1482: <buddy.compiler.graph.operation.NegOp object at 0x7eff172ce570>
-1483: <buddy.compiler.graph.operation.CatOp object at 0x7eff172ce5a0>
-1484: <buddy.compiler.graph.operation.MulOp object at 0x7eff172ce5d0>
-1485: <buddy.compiler.graph.operation.AddOp object at 0x7eff172ce600>
-1486: <buddy.compiler.graph.operation.MulOp object at 0x7eff172ce630>
-1487: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ce660>
-1488: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ce690>
-1489: <buddy.compiler.graph.operation.NegOp object at 0x7eff172ce6c0>
-1490: <buddy.compiler.graph.operation.CatOp object at 0x7eff172ce6f0>
-1491: <buddy.compiler.graph.operation.MulOp object at 0x7eff172ce720>
-1492: <buddy.compiler.graph.operation.AddOp object at 0x7eff172ce750>
-1493: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172ce780>
-1494: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172ce7b0>
-1495: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ce7e0>
-1496: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ce810>
-1497: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff172ce840>
-1498: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ce870>
-1499: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ce8a0>
-1500: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ce8d0>
-1501: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172ce900>
-1502: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff172ce930>
-1503: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff172ce990>
-1504: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172ce960>
-1505: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ce9c0>
-1506: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172ce9f0>
-1507: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cea20>
-1508: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cea50>
-1509: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cea80>
-1510: <buddy.compiler.graph.operation.AddOp object at 0x7eff172ceab0>
-1511: <buddy.compiler.graph.operation.PowOp object at 0x7eff172ceae0>
-1512: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172ceb10>
-1513: <buddy.compiler.graph.operation.AddOp object at 0x7eff172ceb40>
-1514: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff172ceb70>
-1515: <buddy.compiler.graph.operation.MulOp object at 0x7eff172ceba0>
-1516: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cebd0>
-1517: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cec00>
-1518: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cec30>
-1519: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cec60>
-1520: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cec90>
-1521: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff172cecc0>
-1522: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cecf0>
-1523: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172ced20>
-1524: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ced50>
-1525: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172ced80>
-1526: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cedb0>
-1527: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cede0>
-1528: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cee10>
-1529: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cee40>
-1530: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cee70>
-1531: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172ceea0>
-1532: <buddy.compiler.graph.operation.AddOp object at 0x7eff172ceed0>
-1533: <buddy.compiler.graph.operation.PowOp object at 0x7eff172cef00>
-1534: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172cef30>
-1535: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cef60>
-1536: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff172cef90>
-1537: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cefc0>
-1538: <buddy.compiler.graph.operation.MulOp object at 0x7eff172ceff0>
-1539: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cf020>
-1540: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cf050>
-1541: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cf080>
-1542: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cf0b0>
-1543: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cf0e0>
-1544: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cf110>
-1545: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cf140>
-1546: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cf170>
-1547: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cf1a0>
-1548: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cf1d0>
-1549: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cf200>
-1550: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cf230>
-1551: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cf260>
-1552: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cf290>
-1553: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cf2c0>
-1554: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cf2f0>
-1555: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cf320>
-1556: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cf350>
-1557: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172cf380>
-1558: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172cf3b0>
-1559: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cf3e0>
-1560: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cf410>
-1561: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cf440>
-1562: <buddy.compiler.graph.operation.NegOp object at 0x7eff172cf470>
-1563: <buddy.compiler.graph.operation.CatOp object at 0x7eff172cf4a0>
-1564: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cf4d0>
-1565: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cf500>
-1566: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cf530>
-1567: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cf560>
-1568: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cf590>
-1569: <buddy.compiler.graph.operation.NegOp object at 0x7eff172cf5c0>
-1570: <buddy.compiler.graph.operation.CatOp object at 0x7eff172cf5f0>
-1571: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cf620>
-1572: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cf650>
-1573: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172cf680>
-1574: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff172cf6b0>
-1575: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cf6e0>
-1576: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cf710>
-1577: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff172cf740>
-1578: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cf770>
-1579: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cf7a0>
-1580: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cf7d0>
-1581: <buddy.compiler.graph.operation.SliceOp object at 0x7eff172cf800>
-1582: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff172cf830>
-1583: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff172cf890>
-1584: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cf860>
-1585: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cf8c0>
-1586: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cf8f0>
-1587: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cf920>
-1588: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cf950>
-1589: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cf980>
-1590: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cf9b0>
-1591: <buddy.compiler.graph.operation.PowOp object at 0x7eff172cf9e0>
-1592: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172cfa10>
-1593: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cfa40>
-1594: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff172cfa70>
-1595: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cfaa0>
-1596: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cfad0>
-1597: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cfb00>
-1598: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cfb30>
-1599: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cfb60>
-1600: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cfb90>
-1601: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff172cfbc0>
-1602: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cfbf0>
-1603: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cfc20>
-1604: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cfc50>
-1605: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cfc80>
-1606: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cfcb0>
-1607: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cfce0>
-1608: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cfd10>
-1609: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cfd40>
-1610: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cfd70>
-1611: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cfda0>
-1612: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cfdd0>
-1613: <buddy.compiler.graph.operation.PowOp object at 0x7eff172cfe00>
-1614: <buddy.compiler.graph.operation.MeanOp object at 0x7eff172cfe30>
-1615: <buddy.compiler.graph.operation.AddOp object at 0x7eff172cfe60>
-1616: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff172cfe90>
-1617: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cfec0>
-1618: <buddy.compiler.graph.operation.MulOp object at 0x7eff172cfef0>
-1619: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cff20>
-1620: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cff50>
-1621: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff172cff80>
-1622: <buddy.compiler.graph.operation.ViewOp object at 0x7eff172cffb0>
-1623: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff172cffe0>
-1624: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110050>
-1625: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17110080>
-1626: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171100b0>
-1627: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171100e0>
-1628: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110110>
-1629: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17110140>
-1630: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110170>
-1631: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171101a0>
-1632: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171101d0>
-1633: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110200>
-1634: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17110230>
-1635: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110260>
-1636: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17110290>
-1637: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171102c0>
-1638: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171102f0>
-1639: <buddy.compiler.graph.operation.MulOp object at 0x7eff17110320>
-1640: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17110350>
-1641: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17110380>
-1642: <buddy.compiler.graph.operation.NegOp object at 0x7eff171103b0>
-1643: <buddy.compiler.graph.operation.CatOp object at 0x7eff171103e0>
-1644: <buddy.compiler.graph.operation.MulOp object at 0x7eff17110410>
-1645: <buddy.compiler.graph.operation.AddOp object at 0x7eff17110440>
-1646: <buddy.compiler.graph.operation.MulOp object at 0x7eff17110470>
-1647: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171104a0>
-1648: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171104d0>
-1649: <buddy.compiler.graph.operation.NegOp object at 0x7eff17110500>
-1650: <buddy.compiler.graph.operation.CatOp object at 0x7eff17110530>
-1651: <buddy.compiler.graph.operation.MulOp object at 0x7eff17110560>
-1652: <buddy.compiler.graph.operation.AddOp object at 0x7eff17110590>
-1653: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171105c0>
-1654: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171105f0>
-1655: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17110620>
-1656: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17110650>
-1657: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff17110680>
-1658: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171106b0>
-1659: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171106e0>
-1660: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17110710>
-1661: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17110740>
-1662: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff17110770>
-1663: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff171107d0>
-1664: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171107a0>
-1665: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110800>
-1666: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17110830>
-1667: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110860>
-1668: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17110890>
-1669: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171108c0>
-1670: <buddy.compiler.graph.operation.AddOp object at 0x7eff171108f0>
-1671: <buddy.compiler.graph.operation.PowOp object at 0x7eff17110920>
-1672: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17110950>
-1673: <buddy.compiler.graph.operation.AddOp object at 0x7eff17110980>
-1674: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff171109b0>
-1675: <buddy.compiler.graph.operation.MulOp object at 0x7eff171109e0>
-1676: <buddy.compiler.graph.operation.MulOp object at 0x7eff17110a10>
-1677: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17110a40>
-1678: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110a70>
-1679: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17110aa0>
-1680: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110ad0>
-1681: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17110b00>
-1682: <buddy.compiler.graph.operation.MulOp object at 0x7eff17110b30>
-1683: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17110b60>
-1684: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110b90>
-1685: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17110bc0>
-1686: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110bf0>
-1687: <buddy.compiler.graph.operation.MulOp object at 0x7eff17110c20>
-1688: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17110c50>
-1689: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110c80>
-1690: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17110cb0>
-1691: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110ce0>
-1692: <buddy.compiler.graph.operation.AddOp object at 0x7eff17110d10>
-1693: <buddy.compiler.graph.operation.PowOp object at 0x7eff17110d40>
-1694: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17110d70>
-1695: <buddy.compiler.graph.operation.AddOp object at 0x7eff17110da0>
-1696: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17110dd0>
-1697: <buddy.compiler.graph.operation.MulOp object at 0x7eff17110e00>
-1698: <buddy.compiler.graph.operation.MulOp object at 0x7eff17110e30>
-1699: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17110e60>
-1700: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110e90>
-1701: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17110ec0>
-1702: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110ef0>
-1703: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17110f20>
-1704: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110f50>
-1705: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17110f80>
-1706: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17110fb0>
-1707: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17110fe0>
-1708: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111010>
-1709: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17111040>
-1710: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111070>
-1711: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171110a0>
-1712: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171110d0>
-1713: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111100>
-1714: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17111130>
-1715: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111160>
-1716: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17111190>
-1717: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171111c0>
-1718: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171111f0>
-1719: <buddy.compiler.graph.operation.MulOp object at 0x7eff17111220>
-1720: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17111250>
-1721: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17111280>
-1722: <buddy.compiler.graph.operation.NegOp object at 0x7eff171112b0>
-1723: <buddy.compiler.graph.operation.CatOp object at 0x7eff171112e0>
-1724: <buddy.compiler.graph.operation.MulOp object at 0x7eff17111310>
-1725: <buddy.compiler.graph.operation.AddOp object at 0x7eff17111340>
-1726: <buddy.compiler.graph.operation.MulOp object at 0x7eff17111370>
-1727: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171113a0>
-1728: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171113d0>
-1729: <buddy.compiler.graph.operation.NegOp object at 0x7eff17111400>
-1730: <buddy.compiler.graph.operation.CatOp object at 0x7eff17111430>
-1731: <buddy.compiler.graph.operation.MulOp object at 0x7eff17111460>
-1732: <buddy.compiler.graph.operation.AddOp object at 0x7eff17111490>
-1733: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171114c0>
-1734: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171114f0>
-1735: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17111520>
-1736: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17111550>
-1737: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff17111580>
-1738: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171115b0>
-1739: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171115e0>
-1740: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17111610>
-1741: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17111640>
-1742: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff17111670>
-1743: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff171116d0>
-1744: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171116a0>
-1745: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111700>
-1746: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17111730>
-1747: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111760>
-1748: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17111790>
-1749: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171117c0>
-1750: <buddy.compiler.graph.operation.AddOp object at 0x7eff171117f0>
-1751: <buddy.compiler.graph.operation.PowOp object at 0x7eff17111820>
-1752: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17111850>
-1753: <buddy.compiler.graph.operation.AddOp object at 0x7eff17111880>
-1754: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff171118b0>
-1755: <buddy.compiler.graph.operation.MulOp object at 0x7eff171118e0>
-1756: <buddy.compiler.graph.operation.MulOp object at 0x7eff17111910>
-1757: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17111940>
-1758: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111970>
-1759: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171119a0>
-1760: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171119d0>
-1761: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17111a00>
-1762: <buddy.compiler.graph.operation.MulOp object at 0x7eff17111a30>
-1763: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17111a60>
-1764: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111a90>
-1765: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17111ac0>
-1766: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111af0>
-1767: <buddy.compiler.graph.operation.MulOp object at 0x7eff17111b20>
-1768: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17111b50>
-1769: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111b80>
-1770: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17111bb0>
-1771: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111be0>
-1772: <buddy.compiler.graph.operation.AddOp object at 0x7eff17111c10>
-1773: <buddy.compiler.graph.operation.PowOp object at 0x7eff17111c40>
-1774: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17111c70>
-1775: <buddy.compiler.graph.operation.AddOp object at 0x7eff17111ca0>
-1776: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17111cd0>
-1777: <buddy.compiler.graph.operation.MulOp object at 0x7eff17111d00>
-1778: <buddy.compiler.graph.operation.MulOp object at 0x7eff17111d30>
-1779: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17111d60>
-1780: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111d90>
-1781: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17111dc0>
-1782: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111df0>
-1783: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17111e20>
-1784: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111e50>
-1785: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17111e80>
-1786: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111eb0>
-1787: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17111ee0>
-1788: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111f10>
-1789: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17111f40>
-1790: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111f70>
-1791: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17111fa0>
-1792: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17111fd0>
-1793: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112000>
-1794: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17112030>
-1795: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112060>
-1796: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17112090>
-1797: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171120c0>
-1798: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171120f0>
-1799: <buddy.compiler.graph.operation.MulOp object at 0x7eff17112120>
-1800: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17112150>
-1801: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17112180>
-1802: <buddy.compiler.graph.operation.NegOp object at 0x7eff171121b0>
-1803: <buddy.compiler.graph.operation.CatOp object at 0x7eff171121e0>
-1804: <buddy.compiler.graph.operation.MulOp object at 0x7eff17112210>
-1805: <buddy.compiler.graph.operation.AddOp object at 0x7eff17112240>
-1806: <buddy.compiler.graph.operation.MulOp object at 0x7eff17112270>
-1807: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171122a0>
-1808: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171122d0>
-1809: <buddy.compiler.graph.operation.NegOp object at 0x7eff17112300>
-1810: <buddy.compiler.graph.operation.CatOp object at 0x7eff17112330>
-1811: <buddy.compiler.graph.operation.MulOp object at 0x7eff17112360>
-1812: <buddy.compiler.graph.operation.AddOp object at 0x7eff17112390>
-1813: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171123c0>
-1814: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171123f0>
-1815: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17112420>
-1816: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17112450>
-1817: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff17112480>
-1818: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171124b0>
-1819: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171124e0>
-1820: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17112510>
-1821: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17112540>
-1822: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff17112570>
-1823: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff171125d0>
-1824: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171125a0>
-1825: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112600>
-1826: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17112630>
-1827: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112660>
-1828: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17112690>
-1829: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171126c0>
-1830: <buddy.compiler.graph.operation.AddOp object at 0x7eff171126f0>
-1831: <buddy.compiler.graph.operation.PowOp object at 0x7eff17112720>
-1832: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17112750>
-1833: <buddy.compiler.graph.operation.AddOp object at 0x7eff17112780>
-1834: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff171127b0>
-1835: <buddy.compiler.graph.operation.MulOp object at 0x7eff171127e0>
-1836: <buddy.compiler.graph.operation.MulOp object at 0x7eff17112810>
-1837: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17112840>
-1838: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112870>
-1839: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171128a0>
-1840: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171128d0>
-1841: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17112900>
-1842: <buddy.compiler.graph.operation.MulOp object at 0x7eff17112930>
-1843: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17112960>
-1844: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112990>
-1845: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171129c0>
-1846: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171129f0>
-1847: <buddy.compiler.graph.operation.MulOp object at 0x7eff17112a20>
-1848: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17112a50>
-1849: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112a80>
-1850: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17112ab0>
-1851: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112ae0>
-1852: <buddy.compiler.graph.operation.AddOp object at 0x7eff17112b10>
-1853: <buddy.compiler.graph.operation.PowOp object at 0x7eff17112b40>
-1854: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17112b70>
-1855: <buddy.compiler.graph.operation.AddOp object at 0x7eff17112ba0>
-1856: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17112bd0>
-1857: <buddy.compiler.graph.operation.MulOp object at 0x7eff17112c00>
-1858: <buddy.compiler.graph.operation.MulOp object at 0x7eff17112c30>
-1859: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17112c60>
-1860: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112c90>
-1861: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17112cc0>
-1862: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112cf0>
-1863: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17112d20>
-1864: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112d50>
-1865: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17112d80>
-1866: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112db0>
-1867: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17112de0>
-1868: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112e10>
-1869: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17112e40>
-1870: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112e70>
-1871: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112ea0>
-1872: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17112ed0>
-1873: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112f00>
-1874: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17112f30>
-1875: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17112f60>
-1876: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17112f90>
-1877: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17112fc0>
-1878: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17112ff0>
-1879: <buddy.compiler.graph.operation.MulOp object at 0x7eff17113020>
-1880: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17113050>
-1881: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17113080>
-1882: <buddy.compiler.graph.operation.NegOp object at 0x7eff171130b0>
-1883: <buddy.compiler.graph.operation.CatOp object at 0x7eff171130e0>
-1884: <buddy.compiler.graph.operation.MulOp object at 0x7eff17113110>
-1885: <buddy.compiler.graph.operation.AddOp object at 0x7eff17113140>
-1886: <buddy.compiler.graph.operation.MulOp object at 0x7eff17113170>
-1887: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171131a0>
-1888: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171131d0>
-1889: <buddy.compiler.graph.operation.NegOp object at 0x7eff17113200>
-1890: <buddy.compiler.graph.operation.CatOp object at 0x7eff17113230>
-1891: <buddy.compiler.graph.operation.MulOp object at 0x7eff17113260>
-1892: <buddy.compiler.graph.operation.AddOp object at 0x7eff17113290>
-1893: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171132c0>
-1894: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171132f0>
-1895: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17113320>
-1896: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17113350>
-1897: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff17113380>
-1898: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171133b0>
-1899: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171133e0>
-1900: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17113410>
-1901: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17113440>
-1902: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff17113470>
-1903: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff171134d0>
-1904: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171134a0>
-1905: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113500>
-1906: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17113530>
-1907: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113560>
-1908: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17113590>
-1909: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171135c0>
-1910: <buddy.compiler.graph.operation.AddOp object at 0x7eff171135f0>
-1911: <buddy.compiler.graph.operation.PowOp object at 0x7eff17113620>
-1912: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17113650>
-1913: <buddy.compiler.graph.operation.AddOp object at 0x7eff17113680>
-1914: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff171136b0>
-1915: <buddy.compiler.graph.operation.MulOp object at 0x7eff171136e0>
-1916: <buddy.compiler.graph.operation.MulOp object at 0x7eff17113710>
-1917: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17113740>
-1918: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113770>
-1919: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171137a0>
-1920: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171137d0>
-1921: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17113800>
-1922: <buddy.compiler.graph.operation.MulOp object at 0x7eff17113830>
-1923: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17113860>
-1924: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113890>
-1925: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171138c0>
-1926: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171138f0>
-1927: <buddy.compiler.graph.operation.MulOp object at 0x7eff17113920>
-1928: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17113950>
-1929: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113980>
-1930: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171139b0>
-1931: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171139e0>
-1932: <buddy.compiler.graph.operation.AddOp object at 0x7eff17113a10>
-1933: <buddy.compiler.graph.operation.PowOp object at 0x7eff17113a40>
-1934: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17113a70>
-1935: <buddy.compiler.graph.operation.AddOp object at 0x7eff17113aa0>
-1936: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17113ad0>
-1937: <buddy.compiler.graph.operation.MulOp object at 0x7eff17113b00>
-1938: <buddy.compiler.graph.operation.MulOp object at 0x7eff17113b30>
-1939: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17113b60>
-1940: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113b90>
-1941: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17113bc0>
-1942: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113bf0>
-1943: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17113c20>
-1944: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113c50>
-1945: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17113c80>
-1946: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113cb0>
-1947: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17113ce0>
-1948: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113d10>
-1949: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17113d40>
-1950: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113d70>
-1951: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113da0>
-1952: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17113dd0>
-1953: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113e00>
-1954: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17113e30>
-1955: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17113e60>
-1956: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17113e90>
-1957: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17113ec0>
-1958: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17113ef0>
-1959: <buddy.compiler.graph.operation.MulOp object at 0x7eff17113f20>
-1960: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17113f50>
-1961: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17113f80>
-1962: <buddy.compiler.graph.operation.NegOp object at 0x7eff17113fb0>
-1963: <buddy.compiler.graph.operation.CatOp object at 0x7eff17113fe0>
-1964: <buddy.compiler.graph.operation.MulOp object at 0x7eff17150050>
-1965: <buddy.compiler.graph.operation.AddOp object at 0x7eff17150080>
-1966: <buddy.compiler.graph.operation.MulOp object at 0x7eff171500b0>
-1967: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171500e0>
-1968: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17150110>
-1969: <buddy.compiler.graph.operation.NegOp object at 0x7eff17150140>
-1970: <buddy.compiler.graph.operation.CatOp object at 0x7eff17150170>
-1971: <buddy.compiler.graph.operation.MulOp object at 0x7eff171501a0>
-1972: <buddy.compiler.graph.operation.AddOp object at 0x7eff171501d0>
-1973: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17150200>
-1974: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17150230>
-1975: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17150260>
-1976: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17150290>
-1977: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff171502c0>
-1978: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171502f0>
-1979: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17150320>
-1980: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17150350>
-1981: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17150380>
-1982: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff171503b0>
-1983: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff17150410>
-1984: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171503e0>
-1985: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150440>
-1986: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17150470>
-1987: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171504a0>
-1988: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171504d0>
-1989: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150500>
-1990: <buddy.compiler.graph.operation.AddOp object at 0x7eff17150530>
-1991: <buddy.compiler.graph.operation.PowOp object at 0x7eff17150560>
-1992: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17150590>
-1993: <buddy.compiler.graph.operation.AddOp object at 0x7eff171505c0>
-1994: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff171505f0>
-1995: <buddy.compiler.graph.operation.MulOp object at 0x7eff17150620>
-1996: <buddy.compiler.graph.operation.MulOp object at 0x7eff17150650>
-1997: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17150680>
-1998: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171506b0>
-1999: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171506e0>
-2000: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150710>
-2001: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17150740>
-2002: <buddy.compiler.graph.operation.MulOp object at 0x7eff17150770>
-2003: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171507a0>
-2004: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171507d0>
-2005: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17150800>
-2006: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150830>
-2007: <buddy.compiler.graph.operation.MulOp object at 0x7eff17150860>
-2008: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17150890>
-2009: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171508c0>
-2010: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171508f0>
-2011: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150920>
-2012: <buddy.compiler.graph.operation.AddOp object at 0x7eff17150950>
-2013: <buddy.compiler.graph.operation.PowOp object at 0x7eff17150980>
-2014: <buddy.compiler.graph.operation.MeanOp object at 0x7eff171509b0>
-2015: <buddy.compiler.graph.operation.AddOp object at 0x7eff171509e0>
-2016: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17150a10>
-2017: <buddy.compiler.graph.operation.MulOp object at 0x7eff17150a40>
-2018: <buddy.compiler.graph.operation.MulOp object at 0x7eff17150a70>
-2019: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17150aa0>
-2020: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150ad0>
-2021: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17150b00>
-2022: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150b30>
-2023: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17150b60>
-2024: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150b90>
-2025: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17150bc0>
-2026: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150bf0>
-2027: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17150c20>
-2028: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150c50>
-2029: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17150c80>
-2030: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150cb0>
-2031: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150ce0>
-2032: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17150d10>
-2033: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150d40>
-2034: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17150d70>
-2035: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17150da0>
-2036: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17150dd0>
-2037: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17150e00>
-2038: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17150e30>
-2039: <buddy.compiler.graph.operation.MulOp object at 0x7eff17150e60>
-2040: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17150e90>
-2041: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17150ec0>
-2042: <buddy.compiler.graph.operation.NegOp object at 0x7eff17150ef0>
-2043: <buddy.compiler.graph.operation.CatOp object at 0x7eff17150f20>
-2044: <buddy.compiler.graph.operation.MulOp object at 0x7eff17150f50>
-2045: <buddy.compiler.graph.operation.AddOp object at 0x7eff17150f80>
-2046: <buddy.compiler.graph.operation.MulOp object at 0x7eff17150fb0>
-2047: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17150fe0>
-2048: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17151010>
-2049: <buddy.compiler.graph.operation.NegOp object at 0x7eff17151040>
-2050: <buddy.compiler.graph.operation.CatOp object at 0x7eff17151070>
-2051: <buddy.compiler.graph.operation.MulOp object at 0x7eff171510a0>
-2052: <buddy.compiler.graph.operation.AddOp object at 0x7eff171510d0>
-2053: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17151100>
-2054: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17151130>
-2055: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17151160>
-2056: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17151190>
-2057: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff171511c0>
-2058: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171511f0>
-2059: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17151220>
-2060: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17151250>
-2061: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17151280>
-2062: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff171512b0>
-2063: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff17151310>
-2064: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171512e0>
-2065: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151340>
-2066: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17151370>
-2067: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171513a0>
-2068: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171513d0>
-2069: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151400>
-2070: <buddy.compiler.graph.operation.AddOp object at 0x7eff17151430>
-2071: <buddy.compiler.graph.operation.PowOp object at 0x7eff17151460>
-2072: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17151490>
-2073: <buddy.compiler.graph.operation.AddOp object at 0x7eff171514c0>
-2074: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff171514f0>
-2075: <buddy.compiler.graph.operation.MulOp object at 0x7eff17151520>
-2076: <buddy.compiler.graph.operation.MulOp object at 0x7eff17151550>
-2077: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17151580>
-2078: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171515b0>
-2079: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171515e0>
-2080: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151610>
-2081: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17151640>
-2082: <buddy.compiler.graph.operation.MulOp object at 0x7eff17151670>
-2083: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171516a0>
-2084: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171516d0>
-2085: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17151700>
-2086: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151730>
-2087: <buddy.compiler.graph.operation.MulOp object at 0x7eff17151760>
-2088: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17151790>
-2089: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171517c0>
-2090: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171517f0>
-2091: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151820>
-2092: <buddy.compiler.graph.operation.AddOp object at 0x7eff17151850>
-2093: <buddy.compiler.graph.operation.PowOp object at 0x7eff17151880>
-2094: <buddy.compiler.graph.operation.MeanOp object at 0x7eff171518b0>
-2095: <buddy.compiler.graph.operation.AddOp object at 0x7eff171518e0>
-2096: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17151910>
-2097: <buddy.compiler.graph.operation.MulOp object at 0x7eff17151940>
-2098: <buddy.compiler.graph.operation.MulOp object at 0x7eff17151970>
-2099: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171519a0>
-2100: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171519d0>
-2101: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17151a00>
-2102: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151a30>
-2103: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17151a60>
-2104: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151a90>
-2105: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17151ac0>
-2106: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151af0>
-2107: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17151b20>
-2108: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151b50>
-2109: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17151b80>
-2110: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151bb0>
-2111: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151be0>
-2112: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17151c10>
-2113: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151c40>
-2114: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17151c70>
-2115: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17151ca0>
-2116: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17151cd0>
-2117: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17151d00>
-2118: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17151d30>
-2119: <buddy.compiler.graph.operation.MulOp object at 0x7eff17151d60>
-2120: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17151d90>
-2121: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17151dc0>
-2122: <buddy.compiler.graph.operation.NegOp object at 0x7eff17151df0>
-2123: <buddy.compiler.graph.operation.CatOp object at 0x7eff17151e20>
-2124: <buddy.compiler.graph.operation.MulOp object at 0x7eff17151e50>
-2125: <buddy.compiler.graph.operation.AddOp object at 0x7eff17151e80>
-2126: <buddy.compiler.graph.operation.MulOp object at 0x7eff17151eb0>
-2127: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17151ee0>
-2128: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17151f10>
-2129: <buddy.compiler.graph.operation.NegOp object at 0x7eff17151f40>
-2130: <buddy.compiler.graph.operation.CatOp object at 0x7eff17151f70>
-2131: <buddy.compiler.graph.operation.MulOp object at 0x7eff17151fa0>
-2132: <buddy.compiler.graph.operation.AddOp object at 0x7eff17151fd0>
-2133: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17152000>
-2134: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17152030>
-2135: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17152060>
-2136: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17152090>
-2137: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff171520c0>
-2138: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171520f0>
-2139: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17152120>
-2140: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17152150>
-2141: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17152180>
-2142: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff171521b0>
-2143: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff17152210>
-2144: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171521e0>
-2145: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17152240>
-2146: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17152270>
-2147: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171522a0>
-2148: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171522d0>
-2149: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17152300>
-2150: <buddy.compiler.graph.operation.AddOp object at 0x7eff17152330>
-2151: <buddy.compiler.graph.operation.PowOp object at 0x7eff17152360>
-2152: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17152390>
-2153: <buddy.compiler.graph.operation.AddOp object at 0x7eff171523c0>
-2154: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff171523f0>
-2155: <buddy.compiler.graph.operation.MulOp object at 0x7eff17152420>
-2156: <buddy.compiler.graph.operation.MulOp object at 0x7eff17152450>
-2157: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17152480>
-2158: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171524b0>
-2159: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171524e0>
-2160: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17152510>
-2161: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17152540>
-2162: <buddy.compiler.graph.operation.MulOp object at 0x7eff17152570>
-2163: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171525a0>
-2164: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171525d0>
-2165: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17152600>
-2166: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17152630>
-2167: <buddy.compiler.graph.operation.MulOp object at 0x7eff17152660>
-2168: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17152690>
-2169: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171526c0>
-2170: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171526f0>
-2171: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17152720>
-2172: <buddy.compiler.graph.operation.AddOp object at 0x7eff17152750>
-2173: <buddy.compiler.graph.operation.PowOp object at 0x7eff17152780>
-2174: <buddy.compiler.graph.operation.MeanOp object at 0x7eff171527b0>
-2175: <buddy.compiler.graph.operation.AddOp object at 0x7eff171527e0>
-2176: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17152810>
-2177: <buddy.compiler.graph.operation.MulOp object at 0x7eff17152840>
-2178: <buddy.compiler.graph.operation.MulOp object at 0x7eff17152870>
-2179: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171528a0>
-2180: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171528d0>
-2181: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17152900>
-2182: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17152930>
-2183: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17152960>
-2184: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17152990>
-2185: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171529c0>
-2186: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171529f0>
-2187: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17152a20>
-2188: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17152a50>
-2189: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17152a80>
-2190: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17152ab0>
-2191: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17152ae0>
-2192: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17152b10>
-2193: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17152b40>
-2194: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17152b70>
-2195: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17152ba0>
-2196: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17152bd0>
-2197: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17152c00>
-2198: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17152c30>
-2199: <buddy.compiler.graph.operation.MulOp object at 0x7eff17152c60>
-2200: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17152c90>
-2201: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17152cc0>
-2202: <buddy.compiler.graph.operation.NegOp object at 0x7eff17152cf0>
-2203: <buddy.compiler.graph.operation.CatOp object at 0x7eff17152d20>
-2204: <buddy.compiler.graph.operation.MulOp object at 0x7eff17152d50>
-2205: <buddy.compiler.graph.operation.AddOp object at 0x7eff17152d80>
-2206: <buddy.compiler.graph.operation.MulOp object at 0x7eff17152db0>
-2207: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17152de0>
-2208: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17152e10>
-2209: <buddy.compiler.graph.operation.NegOp object at 0x7eff17152e40>
-2210: <buddy.compiler.graph.operation.CatOp object at 0x7eff17152e70>
-2211: <buddy.compiler.graph.operation.MulOp object at 0x7eff17152ea0>
-2212: <buddy.compiler.graph.operation.AddOp object at 0x7eff17152ed0>
-2213: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17152f00>
-2214: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17152f30>
-2215: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17152f60>
-2216: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17152f90>
-2217: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff17152fc0>
-2218: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17152ff0>
-2219: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153020>
-2220: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153050>
-2221: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153080>
-2222: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff171530b0>
-2223: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff17153110>
-2224: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171530e0>
-2225: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17153140>
-2226: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17153170>
-2227: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171531a0>
-2228: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171531d0>
-2229: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17153200>
-2230: <buddy.compiler.graph.operation.AddOp object at 0x7eff17153230>
-2231: <buddy.compiler.graph.operation.PowOp object at 0x7eff17153260>
-2232: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17153290>
-2233: <buddy.compiler.graph.operation.AddOp object at 0x7eff171532c0>
-2234: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff171532f0>
-2235: <buddy.compiler.graph.operation.MulOp object at 0x7eff17153320>
-2236: <buddy.compiler.graph.operation.MulOp object at 0x7eff17153350>
-2237: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17153380>
-2238: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171533b0>
-2239: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171533e0>
-2240: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17153410>
-2241: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17153440>
-2242: <buddy.compiler.graph.operation.MulOp object at 0x7eff17153470>
-2243: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171534a0>
-2244: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171534d0>
-2245: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17153500>
-2246: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17153530>
-2247: <buddy.compiler.graph.operation.MulOp object at 0x7eff17153560>
-2248: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17153590>
-2249: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171535c0>
-2250: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171535f0>
-2251: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17153620>
-2252: <buddy.compiler.graph.operation.AddOp object at 0x7eff17153650>
-2253: <buddy.compiler.graph.operation.PowOp object at 0x7eff17153680>
-2254: <buddy.compiler.graph.operation.MeanOp object at 0x7eff171536b0>
-2255: <buddy.compiler.graph.operation.AddOp object at 0x7eff171536e0>
-2256: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17153710>
-2257: <buddy.compiler.graph.operation.MulOp object at 0x7eff17153740>
-2258: <buddy.compiler.graph.operation.MulOp object at 0x7eff17153770>
-2259: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171537a0>
-2260: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171537d0>
-2261: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17153800>
-2262: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17153830>
-2263: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17153860>
-2264: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17153890>
-2265: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171538c0>
-2266: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171538f0>
-2267: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17153920>
-2268: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17153950>
-2269: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17153980>
-2270: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171539b0>
-2271: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171539e0>
-2272: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17153a10>
-2273: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17153a40>
-2274: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17153a70>
-2275: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17153aa0>
-2276: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17153ad0>
-2277: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17153b00>
-2278: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17153b30>
-2279: <buddy.compiler.graph.operation.MulOp object at 0x7eff17153b60>
-2280: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153b90>
-2281: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153bc0>
-2282: <buddy.compiler.graph.operation.NegOp object at 0x7eff17153bf0>
-2283: <buddy.compiler.graph.operation.CatOp object at 0x7eff17153c20>
-2284: <buddy.compiler.graph.operation.MulOp object at 0x7eff17153c50>
-2285: <buddy.compiler.graph.operation.AddOp object at 0x7eff17153c80>
-2286: <buddy.compiler.graph.operation.MulOp object at 0x7eff17153cb0>
-2287: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153ce0>
-2288: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153d10>
-2289: <buddy.compiler.graph.operation.NegOp object at 0x7eff17153d40>
-2290: <buddy.compiler.graph.operation.CatOp object at 0x7eff17153d70>
-2291: <buddy.compiler.graph.operation.MulOp object at 0x7eff17153da0>
-2292: <buddy.compiler.graph.operation.AddOp object at 0x7eff17153dd0>
-2293: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17153e00>
-2294: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17153e30>
-2295: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153e60>
-2296: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153e90>
-2297: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff17153ec0>
-2298: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153ef0>
-2299: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153f20>
-2300: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153f50>
-2301: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17153f80>
-2302: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff17153fb0>
-2303: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff17194050>
-2304: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17153fe0>
-2305: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194080>
-2306: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171940b0>
-2307: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171940e0>
-2308: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17194110>
-2309: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194140>
-2310: <buddy.compiler.graph.operation.AddOp object at 0x7eff17194170>
-2311: <buddy.compiler.graph.operation.PowOp object at 0x7eff171941a0>
-2312: <buddy.compiler.graph.operation.MeanOp object at 0x7eff171941d0>
-2313: <buddy.compiler.graph.operation.AddOp object at 0x7eff17194200>
-2314: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17194230>
-2315: <buddy.compiler.graph.operation.MulOp object at 0x7eff17194260>
-2316: <buddy.compiler.graph.operation.MulOp object at 0x7eff17194290>
-2317: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171942c0>
-2318: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171942f0>
-2319: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17194320>
-2320: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194350>
-2321: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17194380>
-2322: <buddy.compiler.graph.operation.MulOp object at 0x7eff171943b0>
-2323: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171943e0>
-2324: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194410>
-2325: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17194440>
-2326: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194470>
-2327: <buddy.compiler.graph.operation.MulOp object at 0x7eff171944a0>
-2328: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171944d0>
-2329: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194500>
-2330: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17194530>
-2331: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194560>
-2332: <buddy.compiler.graph.operation.AddOp object at 0x7eff17194590>
-2333: <buddy.compiler.graph.operation.PowOp object at 0x7eff171945c0>
-2334: <buddy.compiler.graph.operation.MeanOp object at 0x7eff171945f0>
-2335: <buddy.compiler.graph.operation.AddOp object at 0x7eff17194620>
-2336: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17194650>
-2337: <buddy.compiler.graph.operation.MulOp object at 0x7eff17194680>
-2338: <buddy.compiler.graph.operation.MulOp object at 0x7eff171946b0>
-2339: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171946e0>
-2340: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194710>
-2341: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17194740>
-2342: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194770>
-2343: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171947a0>
-2344: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171947d0>
-2345: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17194800>
-2346: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194830>
-2347: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17194860>
-2348: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194890>
-2349: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171948c0>
-2350: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171948f0>
-2351: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194920>
-2352: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17194950>
-2353: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194980>
-2354: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171949b0>
-2355: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171949e0>
-2356: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17194a10>
-2357: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17194a40>
-2358: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17194a70>
-2359: <buddy.compiler.graph.operation.MulOp object at 0x7eff17194aa0>
-2360: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17194ad0>
-2361: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17194b00>
-2362: <buddy.compiler.graph.operation.NegOp object at 0x7eff17194b30>
-2363: <buddy.compiler.graph.operation.CatOp object at 0x7eff17194b60>
-2364: <buddy.compiler.graph.operation.MulOp object at 0x7eff17194b90>
-2365: <buddy.compiler.graph.operation.AddOp object at 0x7eff17194bc0>
-2366: <buddy.compiler.graph.operation.MulOp object at 0x7eff17194bf0>
-2367: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17194c20>
-2368: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17194c50>
-2369: <buddy.compiler.graph.operation.NegOp object at 0x7eff17194c80>
-2370: <buddy.compiler.graph.operation.CatOp object at 0x7eff17194cb0>
-2371: <buddy.compiler.graph.operation.MulOp object at 0x7eff17194ce0>
-2372: <buddy.compiler.graph.operation.AddOp object at 0x7eff17194d10>
-2373: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17194d40>
-2374: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17194d70>
-2375: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17194da0>
-2376: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17194dd0>
-2377: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff17194e00>
-2378: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17194e30>
-2379: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17194e60>
-2380: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17194e90>
-2381: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17194ec0>
-2382: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff17194ef0>
-2383: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff17194f50>
-2384: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17194f20>
-2385: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194f80>
-2386: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17194fb0>
-2387: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17194fe0>
-2388: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17195010>
-2389: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195040>
-2390: <buddy.compiler.graph.operation.AddOp object at 0x7eff17195070>
-2391: <buddy.compiler.graph.operation.PowOp object at 0x7eff171950a0>
-2392: <buddy.compiler.graph.operation.MeanOp object at 0x7eff171950d0>
-2393: <buddy.compiler.graph.operation.AddOp object at 0x7eff17195100>
-2394: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17195130>
-2395: <buddy.compiler.graph.operation.MulOp object at 0x7eff17195160>
-2396: <buddy.compiler.graph.operation.MulOp object at 0x7eff17195190>
-2397: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171951c0>
-2398: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171951f0>
-2399: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17195220>
-2400: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195250>
-2401: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17195280>
-2402: <buddy.compiler.graph.operation.MulOp object at 0x7eff171952b0>
-2403: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171952e0>
-2404: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195310>
-2405: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17195340>
-2406: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195370>
-2407: <buddy.compiler.graph.operation.MulOp object at 0x7eff171953a0>
-2408: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171953d0>
-2409: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195400>
-2410: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17195430>
-2411: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195460>
-2412: <buddy.compiler.graph.operation.AddOp object at 0x7eff17195490>
-2413: <buddy.compiler.graph.operation.PowOp object at 0x7eff171954c0>
-2414: <buddy.compiler.graph.operation.MeanOp object at 0x7eff171954f0>
-2415: <buddy.compiler.graph.operation.AddOp object at 0x7eff17195520>
-2416: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17195550>
-2417: <buddy.compiler.graph.operation.MulOp object at 0x7eff17195580>
-2418: <buddy.compiler.graph.operation.MulOp object at 0x7eff171955b0>
-2419: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171955e0>
-2420: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195610>
-2421: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17195640>
-2422: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195670>
-2423: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171956a0>
-2424: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171956d0>
-2425: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17195700>
-2426: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195730>
-2427: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17195760>
-2428: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195790>
-2429: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171957c0>
-2430: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171957f0>
-2431: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195820>
-2432: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17195850>
-2433: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195880>
-2434: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171958b0>
-2435: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171958e0>
-2436: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17195910>
-2437: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17195940>
-2438: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17195970>
-2439: <buddy.compiler.graph.operation.MulOp object at 0x7eff171959a0>
-2440: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171959d0>
-2441: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17195a00>
-2442: <buddy.compiler.graph.operation.NegOp object at 0x7eff17195a30>
-2443: <buddy.compiler.graph.operation.CatOp object at 0x7eff17195a60>
-2444: <buddy.compiler.graph.operation.MulOp object at 0x7eff17195a90>
-2445: <buddy.compiler.graph.operation.AddOp object at 0x7eff17195ac0>
-2446: <buddy.compiler.graph.operation.MulOp object at 0x7eff17195af0>
-2447: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17195b20>
-2448: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17195b50>
-2449: <buddy.compiler.graph.operation.NegOp object at 0x7eff17195b80>
-2450: <buddy.compiler.graph.operation.CatOp object at 0x7eff17195bb0>
-2451: <buddy.compiler.graph.operation.MulOp object at 0x7eff17195be0>
-2452: <buddy.compiler.graph.operation.AddOp object at 0x7eff17195c10>
-2453: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17195c40>
-2454: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17195c70>
-2455: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17195ca0>
-2456: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17195cd0>
-2457: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff17195d00>
-2458: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17195d30>
-2459: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17195d60>
-2460: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17195d90>
-2461: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17195dc0>
-2462: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff17195df0>
-2463: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff17195e50>
-2464: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17195e20>
-2465: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195e80>
-2466: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17195eb0>
-2467: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195ee0>
-2468: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17195f10>
-2469: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17195f40>
-2470: <buddy.compiler.graph.operation.AddOp object at 0x7eff17195f70>
-2471: <buddy.compiler.graph.operation.PowOp object at 0x7eff17195fa0>
-2472: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17195fd0>
-2473: <buddy.compiler.graph.operation.AddOp object at 0x7eff17196000>
-2474: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17196030>
-2475: <buddy.compiler.graph.operation.MulOp object at 0x7eff17196060>
-2476: <buddy.compiler.graph.operation.MulOp object at 0x7eff17196090>
-2477: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171960c0>
-2478: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171960f0>
-2479: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17196120>
-2480: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196150>
-2481: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17196180>
-2482: <buddy.compiler.graph.operation.MulOp object at 0x7eff171961b0>
-2483: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171961e0>
-2484: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196210>
-2485: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17196240>
-2486: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196270>
-2487: <buddy.compiler.graph.operation.MulOp object at 0x7eff171962a0>
-2488: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171962d0>
-2489: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196300>
-2490: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17196330>
-2491: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196360>
-2492: <buddy.compiler.graph.operation.AddOp object at 0x7eff17196390>
-2493: <buddy.compiler.graph.operation.PowOp object at 0x7eff171963c0>
-2494: <buddy.compiler.graph.operation.MeanOp object at 0x7eff171963f0>
-2495: <buddy.compiler.graph.operation.AddOp object at 0x7eff17196420>
-2496: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17196450>
-2497: <buddy.compiler.graph.operation.MulOp object at 0x7eff17196480>
-2498: <buddy.compiler.graph.operation.MulOp object at 0x7eff171964b0>
-2499: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171964e0>
-2500: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196510>
-2501: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17196540>
-2502: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196570>
-2503: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171965a0>
-2504: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171965d0>
-2505: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17196600>
-2506: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196630>
-2507: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17196660>
-2508: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196690>
-2509: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171966c0>
-2510: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171966f0>
-2511: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196720>
-2512: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17196750>
-2513: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196780>
-2514: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171967b0>
-2515: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171967e0>
-2516: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17196810>
-2517: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17196840>
-2518: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17196870>
-2519: <buddy.compiler.graph.operation.MulOp object at 0x7eff171968a0>
-2520: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171968d0>
-2521: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17196900>
-2522: <buddy.compiler.graph.operation.NegOp object at 0x7eff17196930>
-2523: <buddy.compiler.graph.operation.CatOp object at 0x7eff17196960>
-2524: <buddy.compiler.graph.operation.MulOp object at 0x7eff17196990>
-2525: <buddy.compiler.graph.operation.AddOp object at 0x7eff171969c0>
-2526: <buddy.compiler.graph.operation.MulOp object at 0x7eff171969f0>
-2527: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17196a20>
-2528: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17196a50>
-2529: <buddy.compiler.graph.operation.NegOp object at 0x7eff17196a80>
-2530: <buddy.compiler.graph.operation.CatOp object at 0x7eff17196ab0>
-2531: <buddy.compiler.graph.operation.MulOp object at 0x7eff17196ae0>
-2532: <buddy.compiler.graph.operation.AddOp object at 0x7eff17196b10>
-2533: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17196b40>
-2534: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17196b70>
-2535: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17196ba0>
-2536: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17196bd0>
-2537: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff17196c00>
-2538: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17196c30>
-2539: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17196c60>
-2540: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17196c90>
-2541: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17196cc0>
-2542: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff17196cf0>
-2543: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff17196d50>
-2544: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17196d20>
-2545: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196d80>
-2546: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17196db0>
-2547: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196de0>
-2548: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17196e10>
-2549: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196e40>
-2550: <buddy.compiler.graph.operation.AddOp object at 0x7eff17196e70>
-2551: <buddy.compiler.graph.operation.PowOp object at 0x7eff17196ea0>
-2552: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17196ed0>
-2553: <buddy.compiler.graph.operation.AddOp object at 0x7eff17196f00>
-2554: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17196f30>
-2555: <buddy.compiler.graph.operation.MulOp object at 0x7eff17196f60>
-2556: <buddy.compiler.graph.operation.MulOp object at 0x7eff17196f90>
-2557: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17196fc0>
-2558: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17196ff0>
-2559: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17197020>
-2560: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197050>
-2561: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17197080>
-2562: <buddy.compiler.graph.operation.MulOp object at 0x7eff171970b0>
-2563: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171970e0>
-2564: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197110>
-2565: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17197140>
-2566: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197170>
-2567: <buddy.compiler.graph.operation.MulOp object at 0x7eff171971a0>
-2568: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171971d0>
-2569: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197200>
-2570: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17197230>
-2571: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197260>
-2572: <buddy.compiler.graph.operation.AddOp object at 0x7eff17197290>
-2573: <buddy.compiler.graph.operation.PowOp object at 0x7eff171972c0>
-2574: <buddy.compiler.graph.operation.MeanOp object at 0x7eff171972f0>
-2575: <buddy.compiler.graph.operation.AddOp object at 0x7eff17197320>
-2576: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17197350>
-2577: <buddy.compiler.graph.operation.MulOp object at 0x7eff17197380>
-2578: <buddy.compiler.graph.operation.MulOp object at 0x7eff171973b0>
-2579: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171973e0>
-2580: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197410>
-2581: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17197440>
-2582: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197470>
-2583: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171974a0>
-2584: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171974d0>
-2585: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17197500>
-2586: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197530>
-2587: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17197560>
-2588: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197590>
-2589: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171975c0>
-2590: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171975f0>
-2591: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197620>
-2592: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17197650>
-2593: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197680>
-2594: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171976b0>
-2595: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171976e0>
-2596: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17197710>
-2597: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17197740>
-2598: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17197770>
-2599: <buddy.compiler.graph.operation.MulOp object at 0x7eff171977a0>
-2600: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171977d0>
-2601: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17197800>
-2602: <buddy.compiler.graph.operation.NegOp object at 0x7eff17197830>
-2603: <buddy.compiler.graph.operation.CatOp object at 0x7eff17197860>
-2604: <buddy.compiler.graph.operation.MulOp object at 0x7eff17197890>
-2605: <buddy.compiler.graph.operation.AddOp object at 0x7eff171978c0>
-2606: <buddy.compiler.graph.operation.MulOp object at 0x7eff171978f0>
-2607: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17197920>
-2608: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17197950>
-2609: <buddy.compiler.graph.operation.NegOp object at 0x7eff17197980>
-2610: <buddy.compiler.graph.operation.CatOp object at 0x7eff171979b0>
-2611: <buddy.compiler.graph.operation.MulOp object at 0x7eff171979e0>
-2612: <buddy.compiler.graph.operation.AddOp object at 0x7eff17197a10>
-2613: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17197a40>
-2614: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff17197a70>
-2615: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17197aa0>
-2616: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17197ad0>
-2617: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff17197b00>
-2618: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17197b30>
-2619: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17197b60>
-2620: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17197b90>
-2621: <buddy.compiler.graph.operation.SliceOp object at 0x7eff17197bc0>
-2622: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff17197bf0>
-2623: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff17197c50>
-2624: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17197c20>
-2625: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197c80>
-2626: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17197cb0>
-2627: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197ce0>
-2628: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17197d10>
-2629: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197d40>
-2630: <buddy.compiler.graph.operation.AddOp object at 0x7eff17197d70>
-2631: <buddy.compiler.graph.operation.PowOp object at 0x7eff17197da0>
-2632: <buddy.compiler.graph.operation.MeanOp object at 0x7eff17197dd0>
-2633: <buddy.compiler.graph.operation.AddOp object at 0x7eff17197e00>
-2634: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff17197e30>
-2635: <buddy.compiler.graph.operation.MulOp object at 0x7eff17197e60>
-2636: <buddy.compiler.graph.operation.MulOp object at 0x7eff17197e90>
-2637: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17197ec0>
-2638: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197ef0>
-2639: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff17197f20>
-2640: <buddy.compiler.graph.operation.ViewOp object at 0x7eff17197f50>
-2641: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff17197f80>
-2642: <buddy.compiler.graph.operation.MulOp object at 0x7eff17197fb0>
-2643: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff17197fe0>
-2644: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8050>
-2645: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171d8080>
-2646: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d80b0>
-2647: <buddy.compiler.graph.operation.MulOp object at 0x7eff171d80e0>
-2648: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171d8110>
-2649: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8140>
-2650: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171d8170>
-2651: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d81a0>
-2652: <buddy.compiler.graph.operation.AddOp object at 0x7eff171d81d0>
-2653: <buddy.compiler.graph.operation.PowOp object at 0x7eff171d8200>
-2654: <buddy.compiler.graph.operation.MeanOp object at 0x7eff171d8230>
-2655: <buddy.compiler.graph.operation.AddOp object at 0x7eff171d8260>
-2656: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff171d8290>
-2657: <buddy.compiler.graph.operation.MulOp object at 0x7eff171d82c0>
-2658: <buddy.compiler.graph.operation.MulOp object at 0x7eff171d82f0>
-2659: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171d8320>
-2660: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8350>
-2661: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171d8380>
-2662: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d83b0>
-2663: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171d83e0>
-2664: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8410>
-2665: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171d8440>
-2666: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8470>
-2667: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171d84a0>
-2668: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d84d0>
-2669: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171d8500>
-2670: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8530>
-2671: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8560>
-2672: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171d8590>
-2673: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d85c0>
-2674: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171d85f0>
-2675: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8620>
-2676: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171d8650>
-2677: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171d8680>
-2678: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171d86b0>
-2679: <buddy.compiler.graph.operation.MulOp object at 0x7eff171d86e0>
-2680: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171d8710>
-2681: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171d8740>
-2682: <buddy.compiler.graph.operation.NegOp object at 0x7eff171d8770>
-2683: <buddy.compiler.graph.operation.CatOp object at 0x7eff171d87a0>
-2684: <buddy.compiler.graph.operation.MulOp object at 0x7eff171d87d0>
-2685: <buddy.compiler.graph.operation.AddOp object at 0x7eff171d8800>
-2686: <buddy.compiler.graph.operation.MulOp object at 0x7eff171d8830>
-2687: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171d8860>
-2688: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171d8890>
-2689: <buddy.compiler.graph.operation.NegOp object at 0x7eff171d88c0>
-2690: <buddy.compiler.graph.operation.CatOp object at 0x7eff171d88f0>
-2691: <buddy.compiler.graph.operation.MulOp object at 0x7eff171d8920>
-2692: <buddy.compiler.graph.operation.AddOp object at 0x7eff171d8950>
-2693: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171d8980>
-2694: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff171d89b0>
-2695: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171d89e0>
-2696: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171d8a10>
-2697: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff171d8a40>
-2698: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171d8a70>
-2699: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171d8aa0>
-2700: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171d8ad0>
-2701: <buddy.compiler.graph.operation.SliceOp object at 0x7eff171d8b00>
-2702: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff171d8b30>
-2703: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff171d8b90>
-2704: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171d8b60>
-2705: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8bc0>
-2706: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171d8bf0>
-2707: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8c20>
-2708: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171d8c50>
-2709: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8c80>
-2710: <buddy.compiler.graph.operation.AddOp object at 0x7eff171d8cb0>
-2711: <buddy.compiler.graph.operation.PowOp object at 0x7eff171d8ce0>
-2712: <buddy.compiler.graph.operation.MeanOp object at 0x7eff171d8d10>
-2713: <buddy.compiler.graph.operation.AddOp object at 0x7eff171d8d40>
-2714: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff171d8d70>
-2715: <buddy.compiler.graph.operation.MulOp object at 0x7eff171d8da0>
-2716: <buddy.compiler.graph.operation.MulOp object at 0x7eff171d8dd0>
-2717: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171d8e00>
-2718: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8e30>
-2719: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171d8e60>
-2720: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8e90>
-2721: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff171d8ec0>
-2722: <buddy.compiler.graph.operation.MulOp object at 0x7eff171d8ef0>
-2723: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171d8f20>
-2724: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8f50>
-2725: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171d8f80>
-2726: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d8fb0>
-2727: <buddy.compiler.graph.operation.MulOp object at 0x7eff171d8fe0>
-2728: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff171d9010>
-2729: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d9040>
-2730: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff171d9070>
-2731: <buddy.compiler.graph.operation.ViewOp object at 0x7eff171d90a0>
-2732: <buddy.compiler.graph.operation.AddOp object at 0x7eff171d90d0>
-2733: <buddy.compiler.graph.operation.PowOp object at 0x7eff18239f10>
-2734: <buddy.compiler.graph.operation.MeanOp object at 0x7eff18247e60>
-2735: <buddy.compiler.graph.operation.AddOp object at 0x7eff18247d70>
-2736: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff1831f0e0>
-2737: <buddy.compiler.graph.operation.MulOp object at 0x7eff1831ca70>
-2738: <buddy.compiler.graph.operation.MulOp object at 0x7eff18250980>
-2739: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff1826fd40>
-2740: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18621670>
-2741: <buddy.compiler.graph.operation.MatmulOp object at 0x7f056d780980>
-2742: <buddy.compiler.graph.operation.ViewOp object at 0x7eff185e39e0>
-2743: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff185e00b0>
-2744: <buddy.compiler.graph.operation.ViewOp object at 0x7eff185e3c80>
-2745: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff185e0050>
-2746: <buddy.compiler.graph.operation.ViewOp object at 0x7eff185e03b0>
-2747: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff186d3ef0>
-2748: <buddy.compiler.graph.operation.ViewOp object at 0x7f056d717260>
-2749: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff184f63c0>
-2750: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f6390>
-2751: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f6360>
-2752: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f6330>
-2753: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f6300>
-2754: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f62d0>
-2755: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f62a0>
-2756: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f6270>
-2757: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff184f6240>
-2758: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff184f61e0>
-2759: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f61b0>
-2760: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f6180>
-2761: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f6150>
-2762: <buddy.compiler.graph.operation.NegOp object at 0x7eff184f6120>
-2763: <buddy.compiler.graph.operation.CatOp object at 0x7eff184f60f0>
-2764: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f4260>
-2765: <buddy.compiler.graph.operation.AddOp object at 0x7eff184f6090>
-2766: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f6060>
-2767: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f6030>
-2768: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f5fd0>
-2769: <buddy.compiler.graph.operation.NegOp object at 0x7eff184f5fa0>
-2770: <buddy.compiler.graph.operation.CatOp object at 0x7eff184f5f70>
-2771: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f5f40>
-2772: <buddy.compiler.graph.operation.AddOp object at 0x7eff184f5f10>
-2773: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff184f5ee0>
-2774: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff184f5eb0>
-2775: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f5e80>
-2776: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f5e50>
-2777: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff184f5e20>
-2778: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f5dc0>
-2779: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f5d90>
-2780: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f5d60>
-2781: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f5d30>
-2782: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff184f5d00>
-2783: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff184f5ca0>
-2784: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f5cd0>
-2785: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5c70>
-2786: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f5c40>
-2787: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5c10>
-2788: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff184f5be0>
-2789: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5bb0>
-2790: <buddy.compiler.graph.operation.AddOp object at 0x7eff184f5b80>
-2791: <buddy.compiler.graph.operation.PowOp object at 0x7eff184f5b50>
-2792: <buddy.compiler.graph.operation.MeanOp object at 0x7eff184f5b20>
-2793: <buddy.compiler.graph.operation.AddOp object at 0x7eff184f5af0>
-2794: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff184f5ac0>
-2795: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f5a60>
-2796: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f5a30>
-2797: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f5a00>
-2798: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f59d0>
-2799: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff184f59a0>
-2800: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5970>
-2801: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff184f5940>
-2802: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f5910>
-2803: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f58e0>
-2804: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f58b0>
-2805: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff184f5880>
-2806: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5850>
-2807: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f5820>
-2808: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f57f0>
-2809: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f57c0>
-2810: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff184f5760>
-2811: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5730>
-2812: <buddy.compiler.graph.operation.AddOp object at 0x7eff184f5700>
-2813: <buddy.compiler.graph.operation.PowOp object at 0x7eff184f56d0> // Input RMSNorm
-2814: <buddy.compiler.graph.operation.MeanOp object at 0x7eff184f56a0>
-2815: <buddy.compiler.graph.operation.AddOp object at 0x7eff184f5670>
-2816: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff184f5640>
-2817: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f5610>
-2818: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f55e0>
-2819: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f5580>
-2820: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5550>
-2821: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff184f5520>
-2822: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f54f0>
-2823: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f54c0>
-2824: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5460>
-2825: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff184f5430>
-2826: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5400>
-2827: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f53d0>
-2828: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f53a0>
-2829: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff184f5370>
-2830: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5340>
-2831: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5310>
-2832: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f52e0>
-2833: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5280>
-2834: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f5250>
-2835: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f5220>
-2836: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f51f0>
-2837: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff184f51c0>
-2838: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff184f5190>
-2839: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f5160>
-2840: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f5130>
-2841: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f5100>
-2842: <buddy.compiler.graph.operation.NegOp object at 0x7eff184f50d0>
-2843: <buddy.compiler.graph.operation.CatOp object at 0x7eff184f50a0>
-2844: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f5070>
-2845: <buddy.compiler.graph.operation.AddOp object at 0x7eff184f5040>
-2846: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f5010>
-2847: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f4fe0>
-2848: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f4fb0>
-2849: <buddy.compiler.graph.operation.NegOp object at 0x7eff184f4f80>
-2850: <buddy.compiler.graph.operation.CatOp object at 0x7eff184f4f50>
-2851: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f4f20>
-2852: <buddy.compiler.graph.operation.AddOp object at 0x7eff184f4ef0>
-2853: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff184f4ec0>
-2854: <buddy.compiler.graph.operation.UnsqueezeOp object at 0x7eff184f4e90>
-2855: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f4e60>
-2856: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f4e30>
-2857: <buddy.compiler.graph.operation.ExpandOp object at 0x7eff184f4e00>
-2858: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f4dd0>
-2859: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f4da0>
-2860: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f4d70>
-2861: <buddy.compiler.graph.operation.SliceOp object at 0x7eff184f4d40>
-2862: <buddy.compiler.graph.operation.ScaledDotProductFlashAttentionForCpuOp object at 0x7eff184f4d10>
-2863: <buddy.compiler.graph.operation.GetItemOp object at 0x7eff184f4cb0>
-2864: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f4ce0>
-2865: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f4c80>
-2866: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f4c50>
-2867: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f4c20>
-2868: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff184f4bf0>
-2869: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f4bc0>
-2870: <buddy.compiler.graph.operation.AddOp object at 0x7eff184f4b90>
-2871: <buddy.compiler.graph.operation.PowOp object at 0x7eff184f4b60> RMSNorm
-2872: <buddy.compiler.graph.operation.MeanOp object at 0x7eff184f4b30>
-2873: <buddy.compiler.graph.operation.AddOp object at 0x7eff184f4b00>
-2874: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff184f4ad0>
-2875: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f4aa0>
-2876: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f4a70>
-2877: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f4a40>
-2878: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f4a10>
-2879: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff184f49e0>
-2880: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f49b0>
-2881: <buddy.compiler.graph.operation.SigmoidOp object at 0x7eff184f4980>
-2882: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f4950>
-2883: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f4920>
-2884: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f48f0>
-2885: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff184f4890>
-2886: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f4830>
-2887: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f47a0>
-2888: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff184f46b0>
-2889: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f4650>
-2890: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff184f4590>
-2891: <buddy.compiler.graph.operation.ViewOp object at 0x7eff184f4500>
-2892: <buddy.compiler.graph.operation.AddOp object at 0x7eff184f44a0> // Output
-2893: <buddy.compiler.graph.operation.PowOp object at 0x7eff184f4410>
-2894: <buddy.compiler.graph.operation.MeanOp object at 0x7eff184f4320>
-2895: <buddy.compiler.graph.operation.AddOp object at 0x7eff184f4170>
-2896: <buddy.compiler.graph.operation.RsqrtOp object at 0x7eff184f4230>
-2897: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f4110>
-2898: <buddy.compiler.graph.operation.MulOp object at 0x7eff184f4050>
-2899: <buddy.compiler.graph.operation.SliceOp object at 0x7eff187ae9c0>
-2900: <buddy.compiler.graph.operation.SliceOp object at 0x7eff187af980>
-2901: <buddy.compiler.graph.operation.SliceOp object at 0x7f056d3af7a0>
-2902: <buddy.compiler.graph.operation.PermuteOp object at 0x7eff240ef770>
-2903: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18432030>
-2904: <buddy.compiler.graph.operation.MatmulOp object at 0x7eff18431fa0>
-2905: <buddy.compiler.graph.operation.ViewOp object at 0x7eff18433fb0>
-2906: <buddy.compiler.graph.operation.OutputOp object at 0x7eff18433f50>
diff --git a/examples/BuddyLlama/subgraph.mlir b/examples/BuddyLlama/subgraph.mlir
deleted file mode 100644
index 7fa3d2d7..00000000
--- a/examples/BuddyLlama/subgraph.mlir
+++ /dev/null
@@ -1,4894 +0,0 @@
-#map = affine_map<(d0, d1) -> (d0, d1)>
-#map1 = affine_map<(d0, d1, d2) -> (d1)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-module {
-  func.func @subgraph0(%arg0: tensor<32000x4096xf32>, %arg1: tensor<1x40xi64>, %arg2: tensor<64xf32>, %arg3: tensor<4096xf32>, %arg4: tensor<4096x4096xf32>, %arg5: tensor<4096x4096xf32>, %arg6: tensor<4096x4096xf32>, %arg7: tensor<4096x4096xf32>, %arg8: tensor<4096xf32>, %arg9: tensor<11008x4096xf32>, %arg10: tensor<11008x4096xf32>, %arg11: tensor<4096x11008xf32>, %arg12: tensor<4096xf32>, %arg13: tensor<4096x4096xf32>, %arg14: tensor<4096x4096xf32>, %arg15: tensor<4096x4096xf32>, %arg16: tensor<4096x4096xf32>, %arg17: tensor<4096xf32>, %arg18: tensor<11008x4096xf32>, %arg19: tensor<11008x4096xf32>, %arg20: tensor<4096x11008xf32>, %arg21: tensor<4096xf32>, %arg22: tensor<4096x4096xf32>, %arg23: tensor<4096x4096xf32>, %arg24: tensor<4096x4096xf32>, %arg25: tensor<4096x4096xf32>, %arg26: tensor<4096xf32>, %arg27: tensor<11008x4096xf32>, %arg28: tensor<11008x4096xf32>, %arg29: tensor<4096x11008xf32>, %arg30: tensor<4096xf32>, %arg31: tensor<4096x4096xf32>, %arg32: tensor<4096x4096xf32>, %arg33: tensor<4096x4096xf32>, %arg34: tensor<4096x4096xf32>, %arg35: tensor<4096xf32>, %arg36: tensor<11008x4096xf32>, %arg37: tensor<11008x4096xf32>, %arg38: tensor<4096x11008xf32>, %arg39: tensor<4096xf32>, %arg40: tensor<4096x4096xf32>, %arg41: tensor<4096x4096xf32>, %arg42: tensor<4096x4096xf32>, %arg43: tensor<4096x4096xf32>, %arg44: tensor<4096xf32>, %arg45: tensor<11008x4096xf32>, %arg46: tensor<11008x4096xf32>, %arg47: tensor<4096x11008xf32>, %arg48: tensor<4096xf32>, %arg49: tensor<4096x4096xf32>, %arg50: tensor<4096x4096xf32>, %arg51: tensor<4096x4096xf32>, %arg52: tensor<4096x4096xf32>, %arg53: tensor<4096xf32>, %arg54: tensor<11008x4096xf32>, %arg55: tensor<11008x4096xf32>, %arg56: tensor<4096x11008xf32>, %arg57: tensor<4096xf32>, %arg58: tensor<4096x4096xf32>, %arg59: tensor<4096x4096xf32>, %arg60: tensor<4096x4096xf32>, %arg61: tensor<4096x4096xf32>, %arg62: tensor<4096xf32>, %arg63: tensor<11008x4096xf32>, %arg64: tensor<11008x4096xf32>, %arg65: tensor<4096x11008xf32>, 
%arg66: tensor<4096xf32>, %arg67: tensor<4096x4096xf32>, %arg68: tensor<4096x4096xf32>, %arg69: tensor<4096x4096xf32>, %arg70: tensor<4096x4096xf32>, %arg71: tensor<4096xf32>, %arg72: tensor<11008x4096xf32>, %arg73: tensor<11008x4096xf32>, %arg74: tensor<4096x11008xf32>, %arg75: tensor<4096xf32>, %arg76: tensor<4096x4096xf32>, %arg77: tensor<4096x4096xf32>, %arg78: tensor<4096x4096xf32>, %arg79: tensor<4096x4096xf32>, %arg80: tensor<4096xf32>, %arg81: tensor<11008x4096xf32>, %arg82: tensor<11008x4096xf32>, %arg83: tensor<4096x11008xf32>, %arg84: tensor<4096xf32>, %arg85: tensor<4096x4096xf32>, %arg86: tensor<4096x4096xf32>, %arg87: tensor<4096x4096xf32>, %arg88: tensor<4096x4096xf32>, %arg89: tensor<4096xf32>, %arg90: tensor<11008x4096xf32>, %arg91: tensor<11008x4096xf32>, %arg92: tensor<4096x11008xf32>, %arg93: tensor<4096xf32>, %arg94: tensor<4096x4096xf32>, %arg95: tensor<4096x4096xf32>, %arg96: tensor<4096x4096xf32>, %arg97: tensor<4096x4096xf32>, %arg98: tensor<4096xf32>, %arg99: tensor<11008x4096xf32>, %arg100: tensor<11008x4096xf32>, %arg101: tensor<4096x11008xf32>, %arg102: tensor<4096xf32>, %arg103: tensor<4096x4096xf32>, %arg104: tensor<4096x4096xf32>, %arg105: tensor<4096x4096xf32>, %arg106: tensor<4096x4096xf32>, %arg107: tensor<4096xf32>, %arg108: tensor<11008x4096xf32>, %arg109: tensor<11008x4096xf32>, %arg110: tensor<4096x11008xf32>, %arg111: tensor<4096xf32>, %arg112: tensor<4096x4096xf32>, %arg113: tensor<4096x4096xf32>, %arg114: tensor<4096x4096xf32>, %arg115: tensor<4096x4096xf32>, %arg116: tensor<4096xf32>, %arg117: tensor<11008x4096xf32>, %arg118: tensor<11008x4096xf32>, %arg119: tensor<4096x11008xf32>, %arg120: tensor<4096xf32>, %arg121: tensor<4096x4096xf32>, %arg122: tensor<4096x4096xf32>, %arg123: tensor<4096x4096xf32>, %arg124: tensor<4096x4096xf32>, %arg125: tensor<4096xf32>, %arg126: tensor<11008x4096xf32>, %arg127: tensor<11008x4096xf32>, %arg128: tensor<4096x11008xf32>, %arg129: tensor<4096xf32>, %arg130: tensor<4096x4096xf32>, 
%arg131: tensor<4096x4096xf32>, %arg132: tensor<4096x4096xf32>, %arg133: tensor<4096x4096xf32>, %arg134: tensor<4096xf32>, %arg135: tensor<11008x4096xf32>, %arg136: tensor<11008x4096xf32>, %arg137: tensor<4096x11008xf32>, %arg138: tensor<4096xf32>, %arg139: tensor<4096x4096xf32>, %arg140: tensor<4096x4096xf32>, %arg141: tensor<4096x4096xf32>, %arg142: tensor<4096x4096xf32>, %arg143: tensor<4096xf32>, %arg144: tensor<11008x4096xf32>, %arg145: tensor<11008x4096xf32>, %arg146: tensor<4096x11008xf32>, %arg147: tensor<4096xf32>, %arg148: tensor<4096x4096xf32>, %arg149: tensor<4096x4096xf32>, %arg150: tensor<4096x4096xf32>, %arg151: tensor<4096x4096xf32>, %arg152: tensor<4096xf32>, %arg153: tensor<11008x4096xf32>, %arg154: tensor<11008x4096xf32>, %arg155: tensor<4096x11008xf32>, %arg156: tensor<4096xf32>, %arg157: tensor<4096x4096xf32>, %arg158: tensor<4096x4096xf32>, %arg159: tensor<4096x4096xf32>, %arg160: tensor<4096x4096xf32>, %arg161: tensor<4096xf32>, %arg162: tensor<11008x4096xf32>, %arg163: tensor<11008x4096xf32>, %arg164: tensor<4096x11008xf32>, %arg165: tensor<4096xf32>, %arg166: tensor<4096x4096xf32>, %arg167: tensor<4096x4096xf32>, %arg168: tensor<4096x4096xf32>, %arg169: tensor<4096x4096xf32>, %arg170: tensor<4096xf32>, %arg171: tensor<11008x4096xf32>, %arg172: tensor<11008x4096xf32>, %arg173: tensor<4096x11008xf32>, %arg174: tensor<4096xf32>, %arg175: tensor<4096x4096xf32>, %arg176: tensor<4096x4096xf32>, %arg177: tensor<4096x4096xf32>, %arg178: tensor<4096x4096xf32>, %arg179: tensor<4096xf32>, %arg180: tensor<11008x4096xf32>, %arg181: tensor<11008x4096xf32>, %arg182: tensor<4096x11008xf32>, %arg183: tensor<4096xf32>, %arg184: tensor<4096x4096xf32>, %arg185: tensor<4096x4096xf32>, %arg186: tensor<4096x4096xf32>, %arg187: tensor<4096x4096xf32>, %arg188: tensor<4096xf32>, %arg189: tensor<11008x4096xf32>, %arg190: tensor<11008x4096xf32>, %arg191: tensor<4096x11008xf32>, %arg192: tensor<4096xf32>, %arg193: tensor<4096x4096xf32>, %arg194: tensor<4096x4096xf32>, 
%arg195: tensor<4096x4096xf32>, %arg196: tensor<4096x4096xf32>, %arg197: tensor<4096xf32>, %arg198: tensor<11008x4096xf32>, %arg199: tensor<11008x4096xf32>, %arg200: tensor<4096x11008xf32>, %arg201: tensor<4096xf32>, %arg202: tensor<4096x4096xf32>, %arg203: tensor<4096x4096xf32>, %arg204: tensor<4096x4096xf32>, %arg205: tensor<4096x4096xf32>, %arg206: tensor<4096xf32>, %arg207: tensor<11008x4096xf32>, %arg208: tensor<11008x4096xf32>, %arg209: tensor<4096x11008xf32>, %arg210: tensor<4096xf32>, %arg211: tensor<4096x4096xf32>, %arg212: tensor<4096x4096xf32>, %arg213: tensor<4096x4096xf32>, %arg214: tensor<4096x4096xf32>, %arg215: tensor<4096xf32>, %arg216: tensor<11008x4096xf32>, %arg217: tensor<11008x4096xf32>, %arg218: tensor<4096x11008xf32>, %arg219: tensor<4096xf32>, %arg220: tensor<4096x4096xf32>, %arg221: tensor<4096x4096xf32>, %arg222: tensor<4096x4096xf32>, %arg223: tensor<4096x4096xf32>, %arg224: tensor<4096xf32>, %arg225: tensor<11008x4096xf32>, %arg226: tensor<11008x4096xf32>, %arg227: tensor<4096x11008xf32>, %arg228: tensor<4096xf32>, %arg229: tensor<4096x4096xf32>, %arg230: tensor<4096x4096xf32>, %arg231: tensor<4096x4096xf32>, %arg232: tensor<4096x4096xf32>, %arg233: tensor<4096xf32>, %arg234: tensor<11008x4096xf32>, %arg235: tensor<11008x4096xf32>, %arg236: tensor<4096x11008xf32>, %arg237: tensor<4096xf32>, %arg238: tensor<4096x4096xf32>, %arg239: tensor<4096x4096xf32>, %arg240: tensor<4096x4096xf32>, %arg241: tensor<4096x4096xf32>, %arg242: tensor<4096xf32>, %arg243: tensor<11008x4096xf32>, %arg244: tensor<11008x4096xf32>, %arg245: tensor<4096x11008xf32>, %arg246: tensor<4096xf32>, %arg247: tensor<4096x4096xf32>, %arg248: tensor<4096x4096xf32>, %arg249: tensor<4096x4096xf32>, %arg250: tensor<4096x4096xf32>, %arg251: tensor<4096xf32>, %arg252: tensor<11008x4096xf32>, %arg253: tensor<11008x4096xf32>, %arg254: tensor<4096x11008xf32>, %arg255: tensor<4096xf32>, %arg256: tensor<4096x4096xf32>, %arg257: tensor<4096x4096xf32>, %arg258: tensor<4096x4096xf32>, 
%arg259: tensor<4096x4096xf32>, %arg260: tensor<4096xf32>, %arg261: tensor<11008x4096xf32>, %arg262: tensor<11008x4096xf32>, %arg263: tensor<4096x11008xf32>, %arg264: tensor<4096xf32>, %arg265: tensor<4096x4096xf32>, %arg266: tensor<4096x4096xf32>, %arg267: tensor<4096x4096xf32>, %arg268: tensor<4096x4096xf32>, %arg269: tensor<4096xf32>, %arg270: tensor<11008x4096xf32>, %arg271: tensor<11008x4096xf32>, %arg272: tensor<4096x11008xf32>, %arg273: tensor<4096xf32>, %arg274: tensor<4096x4096xf32>, %arg275: tensor<4096x4096xf32>, %arg276: tensor<4096x4096xf32>, %arg277: tensor<4096x4096xf32>, %arg278: tensor<4096xf32>, %arg279: tensor<11008x4096xf32>, %arg280: tensor<11008x4096xf32>, %arg281: tensor<4096x11008xf32>, %arg282: tensor<4096xf32>, %arg283: tensor<4096x4096xf32>, %arg284: tensor<4096x4096xf32>, %arg285: tensor<4096x4096xf32>, %arg286: tensor<4096x4096xf32>, %arg287: tensor<4096xf32>, %arg288: tensor<11008x4096xf32>, %arg289: tensor<11008x4096xf32>, %arg290: tensor<4096x11008xf32>, %arg291: tensor<4096xf32>, %arg292: tensor<32000x4096xf32>) -> tensor<1x40x32000xf32> {
-    %0 = tosa.cast %arg1 : (tensor<1x40xi64>) -> tensor<1x40xi32>
-    %1 = tosa.reshape %arg0 {new_shape = array<i64: 1, 32000, 4096>} : (tensor<32000x4096xf32>) -> tensor<1x32000x4096xf32>
-    %2 = tosa.gather %1, %0 : (tensor<1x32000x4096xf32>, tensor<1x40xi32>) -> tensor<1x40x4096xf32>
-    %3 = tosa.reshape %2 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %4 = "tosa.const"() <{value = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]> : tensor<40xi64>}> : () -> tensor<40xi64>
-    %5 = tosa.reshape %4 {new_shape = array<i64: 1, 40>} : (tensor<40xi64>) -> tensor<1x40xi64>
-    %cst = arith.constant dense<-3.40282347E+38> : tensor<40x41xf32>
-    %6 = "tosa.const"() <{value = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]> : tensor<41xi64>}> : () -> tensor<41xi64>
-    %7 = tosa.reshape %6 {new_shape = array<i64: 1, 41>} : (tensor<41xi64>) -> tensor<1x41xi64>
-    %8 = "tosa.const"() <{value = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]> : tensor<40xi64>}> : () -> tensor<40xi64>
-    %9 = tosa.reshape %8 {new_shape = array<i64: 40, 1>} : (tensor<40xi64>) -> tensor<40x1xi64>
-    %10 = tosa.sub %7, %9 : (tensor<1x41xi64>, tensor<40x1xi64>) -> tensor<40x41xi64>
-    %c1_i64 = arith.constant 1 : i64
-    %splat = tensor.splat %c1_i64 : tensor<40x41xi64>
-    %11 = arith.cmpi sge, %10, %splat : tensor<40x41xi64>
-    %cst_0 = arith.constant 0.000000e+00 : f32
-    %12 = tensor.empty() : tensor<40x41xf32>
-    %splat_1 = tensor.splat %cst_0 : tensor<40x41xf32>
-    %13 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%11, %cst, %splat_1 : tensor<40x41xi1>, tensor<40x41xf32>, tensor<40x41xf32>) outs(%12 : tensor<40x41xf32>) {
-    ^bb0(%in: i1, %in_873: f32, %in_874: f32, %out: f32):
-      %3745 = arith.select %in, %in_873, %in_874 : f32
-      linalg.yield %3745 : f32
-    } -> tensor<40x41xf32>
-    %14 = "tosa.const"() <{value = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]> : tensor<41xi64>}> : () -> tensor<41xi64>
-    %15 = tosa.reshape %4 {new_shape = array<i64: 40, 1>} : (tensor<40xi64>) -> tensor<40x1xi64>
-    %16 = tensor.empty() : tensor<40x41xi1>
-    %17 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"]} ins(%14, %15 : tensor<41xi64>, tensor<40x1xi64>) outs(%16 : tensor<40x41xi1>) {
-    ^bb0(%in: i64, %in_873: i64, %out: i1):
-      %3745 = arith.cmpi sgt, %in, %in_873 : i64
-      linalg.yield %3745 : i1
-    } -> tensor<40x41xi1>
-    %18 = tosa.cast %17 : (tensor<40x41xi1>) -> tensor<40x41xf32>
-    %19 = tosa.mul %13, %18 {shift = 0 : i8} : (tensor<40x41xf32>, tensor<40x41xf32>) -> tensor<40x41xf32> // *******
-    %20 = tosa.reshape %arg2 {new_shape = array<i64: 1, 64>} : (tensor<64xf32>) -> tensor<1x64xf32>
-    %extracted_slice = tensor.extract_slice %20[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> to tensor<1x64xf32>
-    %21 = tosa.reshape %extracted_slice {new_shape = array<i64: 1, 64, 1>} : (tensor<1x64xf32>) -> tensor<1x64x1xf32>
-    %22 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x64x1xf32>}> : () -> tensor<1x64x1xf32>
-    %23 = tosa.add %21, %22 : (tensor<1x64x1xf32>, tensor<1x64x1xf32>) -> tensor<1x64x1xf32>
-    %extracted_slice_2 = tensor.extract_slice %5[0, 0] [1, 40] [1, 1] : tensor<1x40xi64> to tensor<1x40xi64>
-    %24 = tosa.reshape %extracted_slice_2 {new_shape = array<i64: 1, 1, 40>} : (tensor<1x40xi64>) -> tensor<1x1x40xi64>
-    %extracted_slice_3 = tensor.extract_slice %24[0, 0, 0] [1, 1, 40] [1, 1, 1] : tensor<1x1x40xi64> to tensor<1x1x40xi64>
-    %25 = tosa.cast %extracted_slice_3 : (tensor<1x1x40xi64>) -> tensor<1x1x40xf32>
-    %26 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x64x1xf32>}> : () -> tensor<1x64x1xf32>
-    %27 = tosa.add %23, %26 : (tensor<1x64x1xf32>, tensor<1x64x1xf32>) -> tensor<1x64x1xf32>
-    %28 = tosa.reshape %27 {new_shape = array<i64: 1, 64, 1>} : (tensor<1x64x1xf32>) -> tensor<1x64x1xf32>
-    %29 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40xf32>}> : () -> tensor<1x1x40xf32>
-    %30 = tosa.add %25, %29 : (tensor<1x1x40xf32>, tensor<1x1x40xf32>) -> tensor<1x1x40xf32>
-    %31 = tosa.reshape %30 {new_shape = array<i64: 1, 1, 40>} : (tensor<1x1x40xf32>) -> tensor<1x1x40xf32>
-    %32 = tosa.matmul %28, %31 : (tensor<1x64x1xf32>, tensor<1x1x40xf32>) -> tensor<1x64x40xf32>
-    %33 = tosa.reshape %32 {new_shape = array<i64: 1, 64, 40>} : (tensor<1x64x40xf32>) -> tensor<1x64x40xf32>
-    %34 = "tosa.const"() <{value = dense<[0, 2, 1]> : tensor<3xi32>}> : () -> tensor<3xi32>
-    %35 = tosa.transpose %33, %34 : (tensor<1x64x40xf32>, tensor<3xi32>) -> tensor<1x40x64xf32>
-    %36 = tosa.reshape %35 {new_shape = array<i64: 1, 40, 1, 64>} : (tensor<1x40x64xf32>) -> tensor<1x40x1x64xf32>
-    %37 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x40x2x64xf32>}> : () -> tensor<1x40x2x64xf32>
-    %38 = tosa.add %36, %37 : (tensor<1x40x1x64xf32>, tensor<1x40x2x64xf32>) -> tensor<1x40x2x64xf32>
-    %39 = tosa.identity %38 : (tensor<1x40x2x64xf32>) -> tensor<1x40x2x64xf32>
-    %40 = tosa.reshape %39 {new_shape = array<i64: 1, 40, 128>} : (tensor<1x40x2x64xf32>) -> tensor<1x40x128xf32>
-    %41 = tosa.identity %40 : (tensor<1x40x128xf32>) -> tensor<1x40x128xf32>
-    %42 = math.cos %41 : tensor<1x40x128xf32>
-    %43 = math.sin %41 : tensor<1x40x128xf32>
-    %cst_4 = arith.constant dense<1.000000e+00> : tensor<1xf32>
-    %44 = tosa.reshape %cst_4 {new_shape = array<i64: 1, 1, 1>} : (tensor<1xf32>) -> tensor<1x1x1xf32>
-    %45 = tosa.mul %42, %44 {shift = 0 : i8} : (tensor<1x40x128xf32>, tensor<1x1x1xf32>) -> tensor<1x40x128xf32>  // ***************
-    %cst_5 = arith.constant dense<1.000000e+00> : tensor<1xf32>
-    %46 = tosa.reshape %cst_5 {new_shape = array<i64: 1, 1, 1>} : (tensor<1xf32>) -> tensor<1x1x1xf32>
-    %47 = tosa.mul %43, %46 {shift = 0 : i8} : (tensor<1x40x128xf32>, tensor<1x1x1xf32>) -> tensor<1x40x128xf32>  // ***************
-    %48 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32 = arith.constant 2 : i32
-    %49 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<1x40x4096xf32>) outs(%48 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %50 = tosa.reduce_sum %49 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %51 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %52 = tosa.reciprocal %51 : (tensor<1xf32>) -> tensor<1xf32>
-    %53 = tosa.mul %52, %50 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %54 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %55 = tosa.add %53, %54 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %56 = tosa.rsqrt %55 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %57 = tosa.mul %3, %56 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %58 = tosa.reshape %arg3 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %59 = tosa.mul %58, %57 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %60 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %61 = tosa.transpose %arg4, %60 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %62 = tosa.reshape %59 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_6 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %63 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%62, %61 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_6 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %64 = tosa.reshape %63 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %65 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %66 = tosa.transpose %arg5, %65 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %67 = tosa.reshape %59 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_7 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %68 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%67, %66 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_7 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %69 = tosa.reshape %68 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %70 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %71 = tosa.transpose %arg6, %70 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %72 = tosa.reshape %59 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_8 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %73 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%72, %71 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_8 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %74 = tosa.reshape %73 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %75 = tosa.reshape %64 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %76 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %77 = tosa.transpose %75, %76 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %78 = tosa.reshape %69 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %79 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %80 = tosa.transpose %78, %79 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %81 = tosa.reshape %74 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %82 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %83 = tosa.transpose %81, %82 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %84 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %85 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %86 = tosa.mul %77, %84 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_9 = tensor.extract_slice %77[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_10 = tensor.extract_slice %77[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %87 = tensor.empty() : tensor<1x32x40x64xf32>
-    %88 = linalg.negf ins(%extracted_slice_10 : tensor<1x32x40x64xf32>) outs(%87 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %89 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice = tensor.insert_slice %88 into %89[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_11 = tensor.insert_slice %extracted_slice_9 into %inserted_slice[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %90 = tosa.mul %inserted_slice_11, %85 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %91 = tosa.add %86, %90 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %92 = tosa.mul %80, %84 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_12 = tensor.extract_slice %80[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_13 = tensor.extract_slice %80[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %93 = tensor.empty() : tensor<1x32x40x64xf32>
-    %94 = linalg.negf ins(%extracted_slice_13 : tensor<1x32x40x64xf32>) outs(%93 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %95 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_14 = tensor.insert_slice %94 into %95[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_15 = tensor.insert_slice %extracted_slice_12 into %inserted_slice_14[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %96 = tosa.mul %inserted_slice_15, %85 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %97 = tosa.add %92, %96 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %98 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %99 = tosa.reshape %98 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_16 = tensor.extract_slice %99[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_17 = tensor.extract_slice %extracted_slice_16[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %100 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %101 = tosa.add %extracted_slice_17, %100 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_18 = tensor.extract_slice %101[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_19 = tensor.extract_slice %extracted_slice_18[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_20 = tensor.extract_slice %extracted_slice_19[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_21 = tensor.extract_slice %extracted_slice_20[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_22 = arith.constant 0.000000e+00 : f32
-    %splat_23 = tensor.splat %cst_22 : tensor<40x40xf32>
-    %102 = tosa.reshape %extracted_slice_21 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %103 = tosa.add %splat_23, %102 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %104 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %105 = tosa.transpose %97, %104 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %106 = tosa.reshape %91 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %107 = tosa.reshape %105 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %108 = tosa.matmul %106, %107 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_24 = arith.constant 0.0883883461 : f32
-    %splat_25 = tensor.splat %cst_24 : tensor<32x40x40xf32>
-    %109 = tosa.mul %108, %splat_25 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %110 = tosa.add %109, %103 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %111 = tosa.reduce_max %110 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %112 = tosa.sub %110, %111 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %113 = math.exp %112 : tensor<32x40x40xf32>
-    %114 = tosa.reduce_sum %113 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %115 = tosa.log %114 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %116 = tosa.add %111, %115 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %117 = tosa.sub %110, %116 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %118 = math.exp %117 : tensor<32x40x40xf32>
-    %119 = tosa.reshape %116 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %120 = tosa.reshape %83 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %121 = tosa.matmul %118, %120 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %130 = tosa.reshape %129 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %131 = tosa.add %3, %130 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %132 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_27 = arith.constant 2 : i32
-    %133 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%131 : tensor<1x40x4096xf32>) outs(%132 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_27 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %134 = tosa.reduce_sum %133 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %135 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %136 = tosa.reciprocal %135 : (tensor<1xf32>) -> tensor<1xf32>
-    %137 = tosa.mul %136, %134 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %138 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %139 = tosa.add %137, %138 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %140 = tosa.rsqrt %139 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %141 = tosa.mul %131, %140 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %142 = tosa.reshape %arg8 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %143 = tosa.mul %142, %141 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %144 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %145 = tosa.transpose %arg9, %144 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %146 = tosa.reshape %143 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_28 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %147 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%146, %145 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_28 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %148 = tosa.reshape %147 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %149 = tosa.sigmoid %148 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %150 = tosa.mul %148, %149 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %151 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %152 = tosa.transpose %arg10, %151 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %153 = tosa.reshape %143 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_29 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %154 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%153, %152 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_29 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %155 = tosa.reshape %154 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %156 = tosa.mul %150, %155 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %157 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %158 = tosa.transpose %arg11, %157 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %159 = tosa.reshape %156 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_30 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %160 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%159, %158 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_30 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %161 = tosa.reshape %160 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %162 = tosa.add %131, %161 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %163 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_31 = arith.constant 2 : i32
-    %164 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%162 : tensor<1x40x4096xf32>) outs(%163 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_31 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %165 = tosa.reduce_sum %164 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %166 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %167 = tosa.reciprocal %166 : (tensor<1xf32>) -> tensor<1xf32>
-    %168 = tosa.mul %167, %165 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %169 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %170 = tosa.add %168, %169 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %171 = tosa.rsqrt %170 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %172 = tosa.mul %162, %171 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %173 = tosa.reshape %arg12 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %174 = tosa.mul %173, %172 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %175 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %176 = tosa.transpose %arg13, %175 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %177 = tosa.reshape %174 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_32 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %178 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%177, %176 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_32 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %179 = tosa.reshape %178 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %180 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %181 = tosa.transpose %arg14, %180 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %182 = tosa.reshape %174 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_33 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %183 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%182, %181 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_33 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %184 = tosa.reshape %183 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %185 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %186 = tosa.transpose %arg15, %185 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %187 = tosa.reshape %174 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_34 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %188 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%187, %186 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_34 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %189 = tosa.reshape %188 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %190 = tosa.reshape %179 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %191 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %192 = tosa.transpose %190, %191 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %193 = tosa.reshape %184 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %194 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %195 = tosa.transpose %193, %194 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %196 = tosa.reshape %189 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %197 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %198 = tosa.transpose %196, %197 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %199 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %200 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %201 = tosa.mul %192, %199 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_35 = tensor.extract_slice %192[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_36 = tensor.extract_slice %192[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %202 = tensor.empty() : tensor<1x32x40x64xf32>
-    %203 = linalg.negf ins(%extracted_slice_36 : tensor<1x32x40x64xf32>) outs(%202 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %204 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_37 = tensor.insert_slice %203 into %204[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_38 = tensor.insert_slice %extracted_slice_35 into %inserted_slice_37[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %205 = tosa.mul %inserted_slice_38, %200 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %206 = tosa.add %201, %205 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %207 = tosa.mul %195, %199 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_39 = tensor.extract_slice %195[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_40 = tensor.extract_slice %195[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %208 = tensor.empty() : tensor<1x32x40x64xf32>
-    %209 = linalg.negf ins(%extracted_slice_40 : tensor<1x32x40x64xf32>) outs(%208 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %210 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_41 = tensor.insert_slice %209 into %210[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_42 = tensor.insert_slice %extracted_slice_39 into %inserted_slice_41[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %211 = tosa.mul %inserted_slice_42, %200 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %212 = tosa.add %207, %211 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %213 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %214 = tosa.reshape %213 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_43 = tensor.extract_slice %214[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_44 = tensor.extract_slice %extracted_slice_43[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %215 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %216 = tosa.add %extracted_slice_44, %215 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_45 = tensor.extract_slice %216[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_46 = tensor.extract_slice %extracted_slice_45[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_47 = tensor.extract_slice %extracted_slice_46[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_48 = tensor.extract_slice %extracted_slice_47[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_49 = arith.constant 0.000000e+00 : f32
-    %splat_50 = tensor.splat %cst_49 : tensor<40x40xf32>
-    %217 = tosa.reshape %extracted_slice_48 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %218 = tosa.add %splat_50, %217 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %219 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %220 = tosa.transpose %212, %219 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %221 = tosa.reshape %206 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %222 = tosa.reshape %220 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %223 = tosa.matmul %221, %222 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_51 = arith.constant 0.0883883461 : f32
-    %splat_52 = tensor.splat %cst_51 : tensor<32x40x40xf32>
-    %224 = tosa.mul %223, %splat_52 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %225 = tosa.add %224, %218 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %226 = tosa.reduce_max %225 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %227 = tosa.sub %225, %226 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %228 = math.exp %227 : tensor<32x40x40xf32>
-    %229 = tosa.reduce_sum %228 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %230 = tosa.log %229 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %231 = tosa.add %226, %230 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %232 = tosa.sub %225, %231 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %233 = math.exp %232 : tensor<32x40x40xf32>
-    %234 = tosa.reshape %231 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %235 = tosa.reshape %198 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %236 = tosa.matmul %233, %235 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %237 = tosa.reshape %236 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %238 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %239 = tosa.transpose %237, %238 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %240 = tosa.reshape %239 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %241 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %242 = tosa.transpose %arg16, %241 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %243 = tosa.reshape %240 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_53 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %244 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%243, %242 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_53 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %245 = tosa.reshape %244 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %246 = tosa.add %162, %245 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %247 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_54 = arith.constant 2 : i32
-    %248 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%246 : tensor<1x40x4096xf32>) outs(%247 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_54 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %249 = tosa.reduce_sum %248 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %250 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %251 = tosa.reciprocal %250 : (tensor<1xf32>) -> tensor<1xf32>
-    %252 = tosa.mul %251, %249 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %253 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %254 = tosa.add %252, %253 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %255 = tosa.rsqrt %254 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %256 = tosa.mul %246, %255 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %257 = tosa.reshape %arg17 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %258 = tosa.mul %257, %256 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %259 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %260 = tosa.transpose %arg18, %259 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %261 = tosa.reshape %258 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_55 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %262 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%261, %260 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_55 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %263 = tosa.reshape %262 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %264 = tosa.sigmoid %263 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %265 = tosa.mul %263, %264 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %266 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %267 = tosa.transpose %arg19, %266 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %268 = tosa.reshape %258 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_56 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %269 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%268, %267 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_56 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %270 = tosa.reshape %269 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %271 = tosa.mul %265, %270 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %272 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %273 = tosa.transpose %arg20, %272 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %274 = tosa.reshape %271 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_57 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %275 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%274, %273 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_57 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %276 = tosa.reshape %275 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %277 = tosa.add %246, %276 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %278 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_58 = arith.constant 2 : i32
-    %279 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%277 : tensor<1x40x4096xf32>) outs(%278 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_58 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %280 = tosa.reduce_sum %279 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %281 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %282 = tosa.reciprocal %281 : (tensor<1xf32>) -> tensor<1xf32>
-    %283 = tosa.mul %282, %280 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %284 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %285 = tosa.add %283, %284 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %286 = tosa.rsqrt %285 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %287 = tosa.mul %277, %286 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %288 = tosa.reshape %arg21 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %289 = tosa.mul %288, %287 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %290 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %291 = tosa.transpose %arg22, %290 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %292 = tosa.reshape %289 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_59 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %293 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%292, %291 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_59 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %294 = tosa.reshape %293 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %295 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %296 = tosa.transpose %arg23, %295 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %297 = tosa.reshape %289 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_60 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %298 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%297, %296 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_60 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %299 = tosa.reshape %298 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %300 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %301 = tosa.transpose %arg24, %300 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %302 = tosa.reshape %289 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_61 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %303 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%302, %301 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_61 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %304 = tosa.reshape %303 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %305 = tosa.reshape %294 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %306 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %307 = tosa.transpose %305, %306 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %308 = tosa.reshape %299 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %309 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %310 = tosa.transpose %308, %309 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %311 = tosa.reshape %304 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %312 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %313 = tosa.transpose %311, %312 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %314 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %315 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %316 = tosa.mul %307, %314 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_62 = tensor.extract_slice %307[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_63 = tensor.extract_slice %307[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %317 = tensor.empty() : tensor<1x32x40x64xf32>
-    %318 = linalg.negf ins(%extracted_slice_63 : tensor<1x32x40x64xf32>) outs(%317 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %319 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_64 = tensor.insert_slice %318 into %319[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_65 = tensor.insert_slice %extracted_slice_62 into %inserted_slice_64[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %320 = tosa.mul %inserted_slice_65, %315 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %321 = tosa.add %316, %320 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %322 = tosa.mul %310, %314 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_66 = tensor.extract_slice %310[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_67 = tensor.extract_slice %310[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %323 = tensor.empty() : tensor<1x32x40x64xf32>
-    %324 = linalg.negf ins(%extracted_slice_67 : tensor<1x32x40x64xf32>) outs(%323 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %325 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_68 = tensor.insert_slice %324 into %325[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_69 = tensor.insert_slice %extracted_slice_66 into %inserted_slice_68[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %326 = tosa.mul %inserted_slice_69, %315 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %327 = tosa.add %322, %326 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %328 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %329 = tosa.reshape %328 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_70 = tensor.extract_slice %329[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_71 = tensor.extract_slice %extracted_slice_70[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %330 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %331 = tosa.add %extracted_slice_71, %330 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_72 = tensor.extract_slice %331[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_73 = tensor.extract_slice %extracted_slice_72[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_74 = tensor.extract_slice %extracted_slice_73[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_75 = tensor.extract_slice %extracted_slice_74[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_76 = arith.constant 0.000000e+00 : f32
-    %splat_77 = tensor.splat %cst_76 : tensor<40x40xf32>
-    %332 = tosa.reshape %extracted_slice_75 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %333 = tosa.add %splat_77, %332 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %334 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %335 = tosa.transpose %327, %334 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %336 = tosa.reshape %321 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %337 = tosa.reshape %335 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %338 = tosa.matmul %336, %337 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_78 = arith.constant 0.0883883461 : f32
-    %splat_79 = tensor.splat %cst_78 : tensor<32x40x40xf32>
-    %339 = tosa.mul %338, %splat_79 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %340 = tosa.add %339, %333 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %341 = tosa.reduce_max %340 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %342 = tosa.sub %340, %341 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %343 = math.exp %342 : tensor<32x40x40xf32>
-    %344 = tosa.reduce_sum %343 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %345 = tosa.log %344 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %346 = tosa.add %341, %345 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %347 = tosa.sub %340, %346 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %348 = math.exp %347 : tensor<32x40x40xf32>
-    %349 = tosa.reshape %346 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %350 = tosa.reshape %313 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %351 = tosa.matmul %348, %350 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %352 = tosa.reshape %351 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %353 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %354 = tosa.transpose %352, %353 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %355 = tosa.reshape %354 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %356 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %357 = tosa.transpose %arg25, %356 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %358 = tosa.reshape %355 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_80 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %359 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%358, %357 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_80 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %360 = tosa.reshape %359 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %361 = tosa.add %277, %360 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %362 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_81 = arith.constant 2 : i32
-    %363 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%361 : tensor<1x40x4096xf32>) outs(%362 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_81 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %364 = tosa.reduce_sum %363 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %365 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %366 = tosa.reciprocal %365 : (tensor<1xf32>) -> tensor<1xf32>
-    %367 = tosa.mul %366, %364 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %368 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %369 = tosa.add %367, %368 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %370 = tosa.rsqrt %369 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %371 = tosa.mul %361, %370 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %372 = tosa.reshape %arg26 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %373 = tosa.mul %372, %371 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %374 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %375 = tosa.transpose %arg27, %374 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %376 = tosa.reshape %373 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_82 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %377 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%376, %375 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_82 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %378 = tosa.reshape %377 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %379 = tosa.sigmoid %378 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %380 = tosa.mul %378, %379 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %381 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %382 = tosa.transpose %arg28, %381 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %383 = tosa.reshape %373 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_83 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %384 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%383, %382 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_83 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %385 = tosa.reshape %384 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %386 = tosa.mul %380, %385 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %387 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %388 = tosa.transpose %arg29, %387 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %389 = tosa.reshape %386 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_84 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %390 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%389, %388 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_84 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %391 = tosa.reshape %390 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %392 = tosa.add %361, %391 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %393 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_85 = arith.constant 2 : i32
-    %394 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%392 : tensor<1x40x4096xf32>) outs(%393 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_85 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %395 = tosa.reduce_sum %394 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %396 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %397 = tosa.reciprocal %396 : (tensor<1xf32>) -> tensor<1xf32>
-    %398 = tosa.mul %397, %395 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %399 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %400 = tosa.add %398, %399 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %401 = tosa.rsqrt %400 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %402 = tosa.mul %392, %401 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %403 = tosa.reshape %arg30 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %404 = tosa.mul %403, %402 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %405 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %406 = tosa.transpose %arg31, %405 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %407 = tosa.reshape %404 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_86 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %408 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%407, %406 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_86 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %409 = tosa.reshape %408 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %410 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %411 = tosa.transpose %arg32, %410 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %412 = tosa.reshape %404 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_87 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %413 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%412, %411 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_87 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %414 = tosa.reshape %413 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %415 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %416 = tosa.transpose %arg33, %415 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %417 = tosa.reshape %404 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_88 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %418 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%417, %416 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_88 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %419 = tosa.reshape %418 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %420 = tosa.reshape %409 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %421 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %422 = tosa.transpose %420, %421 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %423 = tosa.reshape %414 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %424 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %425 = tosa.transpose %423, %424 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %426 = tosa.reshape %419 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %427 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %428 = tosa.transpose %426, %427 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %429 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %430 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %431 = tosa.mul %422, %429 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_89 = tensor.extract_slice %422[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_90 = tensor.extract_slice %422[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %432 = tensor.empty() : tensor<1x32x40x64xf32>
-    %433 = linalg.negf ins(%extracted_slice_90 : tensor<1x32x40x64xf32>) outs(%432 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %434 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_91 = tensor.insert_slice %433 into %434[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_92 = tensor.insert_slice %extracted_slice_89 into %inserted_slice_91[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %435 = tosa.mul %inserted_slice_92, %430 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %436 = tosa.add %431, %435 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %437 = tosa.mul %425, %429 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_93 = tensor.extract_slice %425[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_94 = tensor.extract_slice %425[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %438 = tensor.empty() : tensor<1x32x40x64xf32>
-    %439 = linalg.negf ins(%extracted_slice_94 : tensor<1x32x40x64xf32>) outs(%438 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %440 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_95 = tensor.insert_slice %439 into %440[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_96 = tensor.insert_slice %extracted_slice_93 into %inserted_slice_95[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %441 = tosa.mul %inserted_slice_96, %430 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %442 = tosa.add %437, %441 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %443 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %444 = tosa.reshape %443 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_97 = tensor.extract_slice %444[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_98 = tensor.extract_slice %extracted_slice_97[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %445 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %446 = tosa.add %extracted_slice_98, %445 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_99 = tensor.extract_slice %446[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_100 = tensor.extract_slice %extracted_slice_99[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_101 = tensor.extract_slice %extracted_slice_100[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_102 = tensor.extract_slice %extracted_slice_101[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_103 = arith.constant 0.000000e+00 : f32
-    %splat_104 = tensor.splat %cst_103 : tensor<40x40xf32>
-    %447 = tosa.reshape %extracted_slice_102 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %448 = tosa.add %splat_104, %447 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %449 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %450 = tosa.transpose %442, %449 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %451 = tosa.reshape %436 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %452 = tosa.reshape %450 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %453 = tosa.matmul %451, %452 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_105 = arith.constant 0.0883883461 : f32
-    %splat_106 = tensor.splat %cst_105 : tensor<32x40x40xf32>
-    %454 = tosa.mul %453, %splat_106 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %455 = tosa.add %454, %448 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %456 = tosa.reduce_max %455 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %457 = tosa.sub %455, %456 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %458 = math.exp %457 : tensor<32x40x40xf32>
-    %459 = tosa.reduce_sum %458 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %460 = tosa.log %459 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %461 = tosa.add %456, %460 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %462 = tosa.sub %455, %461 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %463 = math.exp %462 : tensor<32x40x40xf32>
-    %464 = tosa.reshape %461 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %465 = tosa.reshape %428 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %466 = tosa.matmul %463, %465 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %467 = tosa.reshape %466 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %468 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %469 = tosa.transpose %467, %468 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %470 = tosa.reshape %469 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %471 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %472 = tosa.transpose %arg34, %471 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %473 = tosa.reshape %470 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_107 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %474 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%473, %472 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_107 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %475 = tosa.reshape %474 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %476 = tosa.add %392, %475 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %477 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_108 = arith.constant 2 : i32
-    %478 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%476 : tensor<1x40x4096xf32>) outs(%477 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_108 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %479 = tosa.reduce_sum %478 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %480 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %481 = tosa.reciprocal %480 : (tensor<1xf32>) -> tensor<1xf32>
-    %482 = tosa.mul %481, %479 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %483 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %484 = tosa.add %482, %483 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %485 = tosa.rsqrt %484 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %486 = tosa.mul %476, %485 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %487 = tosa.reshape %arg35 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %488 = tosa.mul %487, %486 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %489 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %490 = tosa.transpose %arg36, %489 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %491 = tosa.reshape %488 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_109 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %492 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%491, %490 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_109 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %493 = tosa.reshape %492 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %494 = tosa.sigmoid %493 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %495 = tosa.mul %493, %494 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %496 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %497 = tosa.transpose %arg37, %496 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %498 = tosa.reshape %488 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_110 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %499 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%498, %497 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_110 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %500 = tosa.reshape %499 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %501 = tosa.mul %495, %500 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %502 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %503 = tosa.transpose %arg38, %502 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %504 = tosa.reshape %501 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_111 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %505 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%504, %503 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_111 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %506 = tosa.reshape %505 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %507 = tosa.add %476, %506 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %508 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_112 = arith.constant 2 : i32
-    %509 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%507 : tensor<1x40x4096xf32>) outs(%508 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_112 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %510 = tosa.reduce_sum %509 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %511 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %512 = tosa.reciprocal %511 : (tensor<1xf32>) -> tensor<1xf32>
-    %513 = tosa.mul %512, %510 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %514 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %515 = tosa.add %513, %514 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %516 = tosa.rsqrt %515 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %517 = tosa.mul %507, %516 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %518 = tosa.reshape %arg39 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %519 = tosa.mul %518, %517 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %520 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %521 = tosa.transpose %arg40, %520 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %522 = tosa.reshape %519 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_113 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %523 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%522, %521 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_113 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %524 = tosa.reshape %523 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %525 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %526 = tosa.transpose %arg41, %525 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %527 = tosa.reshape %519 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_114 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %528 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%527, %526 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_114 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %529 = tosa.reshape %528 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %530 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %531 = tosa.transpose %arg42, %530 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %532 = tosa.reshape %519 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_115 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %533 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%532, %531 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_115 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %534 = tosa.reshape %533 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %535 = tosa.reshape %524 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %536 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %537 = tosa.transpose %535, %536 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %538 = tosa.reshape %529 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %539 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %540 = tosa.transpose %538, %539 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %541 = tosa.reshape %534 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %542 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %543 = tosa.transpose %541, %542 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %544 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %545 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %546 = tosa.mul %537, %544 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_116 = tensor.extract_slice %537[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_117 = tensor.extract_slice %537[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %547 = tensor.empty() : tensor<1x32x40x64xf32>
-    %548 = linalg.negf ins(%extracted_slice_117 : tensor<1x32x40x64xf32>) outs(%547 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %549 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_118 = tensor.insert_slice %548 into %549[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_119 = tensor.insert_slice %extracted_slice_116 into %inserted_slice_118[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %550 = tosa.mul %inserted_slice_119, %545 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %551 = tosa.add %546, %550 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %552 = tosa.mul %540, %544 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_120 = tensor.extract_slice %540[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_121 = tensor.extract_slice %540[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %553 = tensor.empty() : tensor<1x32x40x64xf32>
-    %554 = linalg.negf ins(%extracted_slice_121 : tensor<1x32x40x64xf32>) outs(%553 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %555 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_122 = tensor.insert_slice %554 into %555[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_123 = tensor.insert_slice %extracted_slice_120 into %inserted_slice_122[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %556 = tosa.mul %inserted_slice_123, %545 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %557 = tosa.add %552, %556 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %558 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %559 = tosa.reshape %558 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_124 = tensor.extract_slice %559[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_125 = tensor.extract_slice %extracted_slice_124[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %560 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %561 = tosa.add %extracted_slice_125, %560 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_126 = tensor.extract_slice %561[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_127 = tensor.extract_slice %extracted_slice_126[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_128 = tensor.extract_slice %extracted_slice_127[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_129 = tensor.extract_slice %extracted_slice_128[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_130 = arith.constant 0.000000e+00 : f32
-    %splat_131 = tensor.splat %cst_130 : tensor<40x40xf32>
-    %562 = tosa.reshape %extracted_slice_129 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %563 = tosa.add %splat_131, %562 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %564 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %565 = tosa.transpose %557, %564 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %566 = tosa.reshape %551 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %567 = tosa.reshape %565 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %568 = tosa.matmul %566, %567 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_132 = arith.constant 0.0883883461 : f32
-    %splat_133 = tensor.splat %cst_132 : tensor<32x40x40xf32>
-    %569 = tosa.mul %568, %splat_133 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %570 = tosa.add %569, %563 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %571 = tosa.reduce_max %570 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %572 = tosa.sub %570, %571 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %573 = math.exp %572 : tensor<32x40x40xf32>
-    %574 = tosa.reduce_sum %573 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %575 = tosa.log %574 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %576 = tosa.add %571, %575 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %577 = tosa.sub %570, %576 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %578 = math.exp %577 : tensor<32x40x40xf32>
-    %579 = tosa.reshape %576 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %580 = tosa.reshape %543 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %581 = tosa.matmul %578, %580 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %582 = tosa.reshape %581 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %583 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %584 = tosa.transpose %582, %583 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %585 = tosa.reshape %584 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %586 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %587 = tosa.transpose %arg43, %586 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %588 = tosa.reshape %585 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_134 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %589 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%588, %587 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_134 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %590 = tosa.reshape %589 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %591 = tosa.add %507, %590 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %592 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_135 = arith.constant 2 : i32
-    %593 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%591 : tensor<1x40x4096xf32>) outs(%592 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_135 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %594 = tosa.reduce_sum %593 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %595 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %596 = tosa.reciprocal %595 : (tensor<1xf32>) -> tensor<1xf32>
-    %597 = tosa.mul %596, %594 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %598 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %599 = tosa.add %597, %598 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %600 = tosa.rsqrt %599 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %601 = tosa.mul %591, %600 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %602 = tosa.reshape %arg44 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %603 = tosa.mul %602, %601 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %604 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %605 = tosa.transpose %arg45, %604 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %606 = tosa.reshape %603 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_136 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %607 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%606, %605 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_136 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %608 = tosa.reshape %607 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %609 = tosa.sigmoid %608 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %610 = tosa.mul %608, %609 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %611 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %612 = tosa.transpose %arg46, %611 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %613 = tosa.reshape %603 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_137 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %614 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%613, %612 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_137 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %615 = tosa.reshape %614 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %616 = tosa.mul %610, %615 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %617 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %618 = tosa.transpose %arg47, %617 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %619 = tosa.reshape %616 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_138 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %620 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%619, %618 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_138 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %621 = tosa.reshape %620 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %622 = tosa.add %591, %621 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %623 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_139 = arith.constant 2 : i32
-    %624 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%622 : tensor<1x40x4096xf32>) outs(%623 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_139 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %625 = tosa.reduce_sum %624 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %626 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %627 = tosa.reciprocal %626 : (tensor<1xf32>) -> tensor<1xf32>
-    %628 = tosa.mul %627, %625 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %629 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %630 = tosa.add %628, %629 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %631 = tosa.rsqrt %630 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %632 = tosa.mul %622, %631 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %633 = tosa.reshape %arg48 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %634 = tosa.mul %633, %632 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %635 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %636 = tosa.transpose %arg49, %635 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %637 = tosa.reshape %634 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_140 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %638 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%637, %636 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_140 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %639 = tosa.reshape %638 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %640 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %641 = tosa.transpose %arg50, %640 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %642 = tosa.reshape %634 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_141 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %643 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%642, %641 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_141 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %644 = tosa.reshape %643 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %645 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %646 = tosa.transpose %arg51, %645 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %647 = tosa.reshape %634 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_142 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %648 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%647, %646 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_142 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %649 = tosa.reshape %648 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %650 = tosa.reshape %639 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %651 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %652 = tosa.transpose %650, %651 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %653 = tosa.reshape %644 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %654 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %655 = tosa.transpose %653, %654 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %656 = tosa.reshape %649 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %657 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %658 = tosa.transpose %656, %657 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %659 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %660 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %661 = tosa.mul %652, %659 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_143 = tensor.extract_slice %652[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_144 = tensor.extract_slice %652[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %662 = tensor.empty() : tensor<1x32x40x64xf32>
-    %663 = linalg.negf ins(%extracted_slice_144 : tensor<1x32x40x64xf32>) outs(%662 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %664 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_145 = tensor.insert_slice %663 into %664[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_146 = tensor.insert_slice %extracted_slice_143 into %inserted_slice_145[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %665 = tosa.mul %inserted_slice_146, %660 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %666 = tosa.add %661, %665 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %667 = tosa.mul %655, %659 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_147 = tensor.extract_slice %655[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_148 = tensor.extract_slice %655[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %668 = tensor.empty() : tensor<1x32x40x64xf32>
-    %669 = linalg.negf ins(%extracted_slice_148 : tensor<1x32x40x64xf32>) outs(%668 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %670 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_149 = tensor.insert_slice %669 into %670[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_150 = tensor.insert_slice %extracted_slice_147 into %inserted_slice_149[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %671 = tosa.mul %inserted_slice_150, %660 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %672 = tosa.add %667, %671 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %673 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %674 = tosa.reshape %673 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_151 = tensor.extract_slice %674[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_152 = tensor.extract_slice %extracted_slice_151[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %675 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %676 = tosa.add %extracted_slice_152, %675 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_153 = tensor.extract_slice %676[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_154 = tensor.extract_slice %extracted_slice_153[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_155 = tensor.extract_slice %extracted_slice_154[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_156 = tensor.extract_slice %extracted_slice_155[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_157 = arith.constant 0.000000e+00 : f32
-    %splat_158 = tensor.splat %cst_157 : tensor<40x40xf32>
-    %677 = tosa.reshape %extracted_slice_156 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %678 = tosa.add %splat_158, %677 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %679 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %680 = tosa.transpose %672, %679 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %681 = tosa.reshape %666 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %682 = tosa.reshape %680 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %683 = tosa.matmul %681, %682 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_159 = arith.constant 0.0883883461 : f32
-    %splat_160 = tensor.splat %cst_159 : tensor<32x40x40xf32>
-    %684 = tosa.mul %683, %splat_160 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %685 = tosa.add %684, %678 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %686 = tosa.reduce_max %685 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %687 = tosa.sub %685, %686 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %688 = math.exp %687 : tensor<32x40x40xf32>
-    %689 = tosa.reduce_sum %688 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %690 = tosa.log %689 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %691 = tosa.add %686, %690 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %692 = tosa.sub %685, %691 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %693 = math.exp %692 : tensor<32x40x40xf32>
-    %694 = tosa.reshape %691 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %695 = tosa.reshape %658 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %696 = tosa.matmul %693, %695 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %697 = tosa.reshape %696 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %698 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %699 = tosa.transpose %697, %698 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %700 = tosa.reshape %699 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %701 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %702 = tosa.transpose %arg52, %701 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %703 = tosa.reshape %700 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_161 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %704 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%703, %702 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_161 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %705 = tosa.reshape %704 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %706 = tosa.add %622, %705 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %707 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_162 = arith.constant 2 : i32
-    %708 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%706 : tensor<1x40x4096xf32>) outs(%707 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_162 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %709 = tosa.reduce_sum %708 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %710 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %711 = tosa.reciprocal %710 : (tensor<1xf32>) -> tensor<1xf32>
-    %712 = tosa.mul %711, %709 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %713 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %714 = tosa.add %712, %713 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %715 = tosa.rsqrt %714 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %716 = tosa.mul %706, %715 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %717 = tosa.reshape %arg53 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %718 = tosa.mul %717, %716 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %719 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %720 = tosa.transpose %arg54, %719 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %721 = tosa.reshape %718 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_163 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %722 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%721, %720 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_163 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %723 = tosa.reshape %722 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %724 = tosa.sigmoid %723 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %725 = tosa.mul %723, %724 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %726 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %727 = tosa.transpose %arg55, %726 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %728 = tosa.reshape %718 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_164 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %729 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%728, %727 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_164 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %730 = tosa.reshape %729 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %731 = tosa.mul %725, %730 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %732 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %733 = tosa.transpose %arg56, %732 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %734 = tosa.reshape %731 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_165 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %735 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%734, %733 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_165 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %736 = tosa.reshape %735 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %737 = tosa.add %706, %736 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %738 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_166 = arith.constant 2 : i32
-    %739 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%737 : tensor<1x40x4096xf32>) outs(%738 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_166 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %740 = tosa.reduce_sum %739 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %741 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %742 = tosa.reciprocal %741 : (tensor<1xf32>) -> tensor<1xf32>
-    %743 = tosa.mul %742, %740 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %744 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %745 = tosa.add %743, %744 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %746 = tosa.rsqrt %745 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %747 = tosa.mul %737, %746 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %748 = tosa.reshape %arg57 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %749 = tosa.mul %748, %747 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %750 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %751 = tosa.transpose %arg58, %750 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %752 = tosa.reshape %749 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_167 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %753 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%752, %751 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_167 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %754 = tosa.reshape %753 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %755 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %756 = tosa.transpose %arg59, %755 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %757 = tosa.reshape %749 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_168 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %758 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%757, %756 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_168 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %759 = tosa.reshape %758 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %760 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %761 = tosa.transpose %arg60, %760 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %762 = tosa.reshape %749 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_169 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %763 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%762, %761 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_169 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %764 = tosa.reshape %763 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %765 = tosa.reshape %754 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %766 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %767 = tosa.transpose %765, %766 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %768 = tosa.reshape %759 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %769 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %770 = tosa.transpose %768, %769 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %771 = tosa.reshape %764 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %772 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %773 = tosa.transpose %771, %772 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %774 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %775 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %776 = tosa.mul %767, %774 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_170 = tensor.extract_slice %767[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_171 = tensor.extract_slice %767[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %777 = tensor.empty() : tensor<1x32x40x64xf32>
-    %778 = linalg.negf ins(%extracted_slice_171 : tensor<1x32x40x64xf32>) outs(%777 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %779 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_172 = tensor.insert_slice %778 into %779[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_173 = tensor.insert_slice %extracted_slice_170 into %inserted_slice_172[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %780 = tosa.mul %inserted_slice_173, %775 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %781 = tosa.add %776, %780 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %782 = tosa.mul %770, %774 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_174 = tensor.extract_slice %770[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_175 = tensor.extract_slice %770[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %783 = tensor.empty() : tensor<1x32x40x64xf32>
-    %784 = linalg.negf ins(%extracted_slice_175 : tensor<1x32x40x64xf32>) outs(%783 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %785 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_176 = tensor.insert_slice %784 into %785[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_177 = tensor.insert_slice %extracted_slice_174 into %inserted_slice_176[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %786 = tosa.mul %inserted_slice_177, %775 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %787 = tosa.add %782, %786 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %788 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %789 = tosa.reshape %788 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_178 = tensor.extract_slice %789[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_179 = tensor.extract_slice %extracted_slice_178[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %790 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %791 = tosa.add %extracted_slice_179, %790 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_180 = tensor.extract_slice %791[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_181 = tensor.extract_slice %extracted_slice_180[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_182 = tensor.extract_slice %extracted_slice_181[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_183 = tensor.extract_slice %extracted_slice_182[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_184 = arith.constant 0.000000e+00 : f32
-    %splat_185 = tensor.splat %cst_184 : tensor<40x40xf32>
-    %792 = tosa.reshape %extracted_slice_183 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %793 = tosa.add %splat_185, %792 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %794 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %795 = tosa.transpose %787, %794 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %796 = tosa.reshape %781 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %797 = tosa.reshape %795 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %798 = tosa.matmul %796, %797 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_186 = arith.constant 0.0883883461 : f32
-    %splat_187 = tensor.splat %cst_186 : tensor<32x40x40xf32>
-    %799 = tosa.mul %798, %splat_187 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %800 = tosa.add %799, %793 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %801 = tosa.reduce_max %800 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %802 = tosa.sub %800, %801 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %803 = math.exp %802 : tensor<32x40x40xf32>
-    %804 = tosa.reduce_sum %803 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %805 = tosa.log %804 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %806 = tosa.add %801, %805 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %807 = tosa.sub %800, %806 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %808 = math.exp %807 : tensor<32x40x40xf32>
-    %809 = tosa.reshape %806 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %810 = tosa.reshape %773 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %811 = tosa.matmul %808, %810 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %812 = tosa.reshape %811 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %813 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %814 = tosa.transpose %812, %813 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %815 = tosa.reshape %814 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %816 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %817 = tosa.transpose %arg61, %816 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %818 = tosa.reshape %815 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_188 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %819 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%818, %817 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_188 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %820 = tosa.reshape %819 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %821 = tosa.add %737, %820 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %822 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_189 = arith.constant 2 : i32
-    %823 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%821 : tensor<1x40x4096xf32>) outs(%822 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_189 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %824 = tosa.reduce_sum %823 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %825 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %826 = tosa.reciprocal %825 : (tensor<1xf32>) -> tensor<1xf32>
-    %827 = tosa.mul %826, %824 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %828 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %829 = tosa.add %827, %828 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %830 = tosa.rsqrt %829 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %831 = tosa.mul %821, %830 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %832 = tosa.reshape %arg62 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %833 = tosa.mul %832, %831 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %834 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %835 = tosa.transpose %arg63, %834 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %836 = tosa.reshape %833 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_190 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %837 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%836, %835 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_190 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %838 = tosa.reshape %837 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %839 = tosa.sigmoid %838 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %840 = tosa.mul %838, %839 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %841 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %842 = tosa.transpose %arg64, %841 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %843 = tosa.reshape %833 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_191 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %844 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%843, %842 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_191 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %845 = tosa.reshape %844 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %846 = tosa.mul %840, %845 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %847 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %848 = tosa.transpose %arg65, %847 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %849 = tosa.reshape %846 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_192 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %850 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%849, %848 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_192 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %851 = tosa.reshape %850 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %852 = tosa.add %821, %851 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %853 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_193 = arith.constant 2 : i32
-    %854 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%852 : tensor<1x40x4096xf32>) outs(%853 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_193 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %855 = tosa.reduce_sum %854 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %856 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %857 = tosa.reciprocal %856 : (tensor<1xf32>) -> tensor<1xf32>
-    %858 = tosa.mul %857, %855 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %859 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %860 = tosa.add %858, %859 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %861 = tosa.rsqrt %860 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %862 = tosa.mul %852, %861 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %863 = tosa.reshape %arg66 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %864 = tosa.mul %863, %862 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %865 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %866 = tosa.transpose %arg67, %865 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %867 = tosa.reshape %864 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_194 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %868 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%867, %866 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_194 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %869 = tosa.reshape %868 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %870 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %871 = tosa.transpose %arg68, %870 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %872 = tosa.reshape %864 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_195 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %873 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%872, %871 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_195 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %874 = tosa.reshape %873 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %875 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %876 = tosa.transpose %arg69, %875 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %877 = tosa.reshape %864 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_196 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %878 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%877, %876 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_196 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %879 = tosa.reshape %878 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %880 = tosa.reshape %869 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %881 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %882 = tosa.transpose %880, %881 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %883 = tosa.reshape %874 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %884 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %885 = tosa.transpose %883, %884 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %886 = tosa.reshape %879 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %887 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %888 = tosa.transpose %886, %887 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %889 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %890 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %891 = tosa.mul %882, %889 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_197 = tensor.extract_slice %882[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_198 = tensor.extract_slice %882[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %892 = tensor.empty() : tensor<1x32x40x64xf32>
-    %893 = linalg.negf ins(%extracted_slice_198 : tensor<1x32x40x64xf32>) outs(%892 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %894 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_199 = tensor.insert_slice %893 into %894[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_200 = tensor.insert_slice %extracted_slice_197 into %inserted_slice_199[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %895 = tosa.mul %inserted_slice_200, %890 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %896 = tosa.add %891, %895 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %897 = tosa.mul %885, %889 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_201 = tensor.extract_slice %885[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_202 = tensor.extract_slice %885[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %898 = tensor.empty() : tensor<1x32x40x64xf32>
-    %899 = linalg.negf ins(%extracted_slice_202 : tensor<1x32x40x64xf32>) outs(%898 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %900 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_203 = tensor.insert_slice %899 into %900[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_204 = tensor.insert_slice %extracted_slice_201 into %inserted_slice_203[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %901 = tosa.mul %inserted_slice_204, %890 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %902 = tosa.add %897, %901 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %903 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %904 = tosa.reshape %903 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_205 = tensor.extract_slice %904[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_206 = tensor.extract_slice %extracted_slice_205[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %905 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %906 = tosa.add %extracted_slice_206, %905 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_207 = tensor.extract_slice %906[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_208 = tensor.extract_slice %extracted_slice_207[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_209 = tensor.extract_slice %extracted_slice_208[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_210 = tensor.extract_slice %extracted_slice_209[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_211 = arith.constant 0.000000e+00 : f32
-    %splat_212 = tensor.splat %cst_211 : tensor<40x40xf32>
-    %907 = tosa.reshape %extracted_slice_210 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %908 = tosa.add %splat_212, %907 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %909 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %910 = tosa.transpose %902, %909 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %911 = tosa.reshape %896 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %912 = tosa.reshape %910 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %913 = tosa.matmul %911, %912 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_213 = arith.constant 0.0883883461 : f32
-    %splat_214 = tensor.splat %cst_213 : tensor<32x40x40xf32>
-    %914 = tosa.mul %913, %splat_214 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %915 = tosa.add %914, %908 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %916 = tosa.reduce_max %915 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %917 = tosa.sub %915, %916 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %918 = math.exp %917 : tensor<32x40x40xf32>
-    %919 = tosa.reduce_sum %918 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %920 = tosa.log %919 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %921 = tosa.add %916, %920 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %922 = tosa.sub %915, %921 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %923 = math.exp %922 : tensor<32x40x40xf32>
-    %924 = tosa.reshape %921 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %925 = tosa.reshape %888 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %926 = tosa.matmul %923, %925 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %927 = tosa.reshape %926 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %928 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %929 = tosa.transpose %927, %928 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %930 = tosa.reshape %929 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %931 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %932 = tosa.transpose %arg70, %931 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %933 = tosa.reshape %930 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_215 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %934 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%933, %932 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_215 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %935 = tosa.reshape %934 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %936 = tosa.add %852, %935 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %937 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_216 = arith.constant 2 : i32
-    %938 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%936 : tensor<1x40x4096xf32>) outs(%937 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_216 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %939 = tosa.reduce_sum %938 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %940 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %941 = tosa.reciprocal %940 : (tensor<1xf32>) -> tensor<1xf32>
-    %942 = tosa.mul %941, %939 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %943 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %944 = tosa.add %942, %943 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %945 = tosa.rsqrt %944 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %946 = tosa.mul %936, %945 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %947 = tosa.reshape %arg71 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %948 = tosa.mul %947, %946 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %949 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %950 = tosa.transpose %arg72, %949 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %951 = tosa.reshape %948 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_217 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %952 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%951, %950 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_217 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %953 = tosa.reshape %952 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %954 = tosa.sigmoid %953 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %955 = tosa.mul %953, %954 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %956 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %957 = tosa.transpose %arg73, %956 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %958 = tosa.reshape %948 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_218 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %959 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%958, %957 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_218 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %960 = tosa.reshape %959 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %961 = tosa.mul %955, %960 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %962 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %963 = tosa.transpose %arg74, %962 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %964 = tosa.reshape %961 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_219 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %965 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%964, %963 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_219 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %966 = tosa.reshape %965 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %967 = tosa.add %936, %966 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %968 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_220 = arith.constant 2 : i32
-    %969 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%967 : tensor<1x40x4096xf32>) outs(%968 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_220 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %970 = tosa.reduce_sum %969 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %971 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %972 = tosa.reciprocal %971 : (tensor<1xf32>) -> tensor<1xf32>
-    %973 = tosa.mul %972, %970 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %974 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %975 = tosa.add %973, %974 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %976 = tosa.rsqrt %975 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %977 = tosa.mul %967, %976 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %978 = tosa.reshape %arg75 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %979 = tosa.mul %978, %977 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %980 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %981 = tosa.transpose %arg76, %980 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %982 = tosa.reshape %979 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_221 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %983 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%982, %981 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_221 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %984 = tosa.reshape %983 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %985 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %986 = tosa.transpose %arg77, %985 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %987 = tosa.reshape %979 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_222 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %988 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%987, %986 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_222 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %989 = tosa.reshape %988 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %990 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %991 = tosa.transpose %arg78, %990 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %992 = tosa.reshape %979 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_223 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %993 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%992, %991 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_223 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %994 = tosa.reshape %993 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %995 = tosa.reshape %984 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %996 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %997 = tosa.transpose %995, %996 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %998 = tosa.reshape %989 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %999 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1000 = tosa.transpose %998, %999 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1001 = tosa.reshape %994 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1002 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1003 = tosa.transpose %1001, %1002 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1004 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1005 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1006 = tosa.mul %997, %1004 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_224 = tensor.extract_slice %997[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_225 = tensor.extract_slice %997[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1007 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1008 = linalg.negf ins(%extracted_slice_225 : tensor<1x32x40x64xf32>) outs(%1007 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1009 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_226 = tensor.insert_slice %1008 into %1009[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_227 = tensor.insert_slice %extracted_slice_224 into %inserted_slice_226[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1010 = tosa.mul %inserted_slice_227, %1005 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1011 = tosa.add %1006, %1010 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1012 = tosa.mul %1000, %1004 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_228 = tensor.extract_slice %1000[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_229 = tensor.extract_slice %1000[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1013 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1014 = linalg.negf ins(%extracted_slice_229 : tensor<1x32x40x64xf32>) outs(%1013 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1015 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_230 = tensor.insert_slice %1014 into %1015[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_231 = tensor.insert_slice %extracted_slice_228 into %inserted_slice_230[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1016 = tosa.mul %inserted_slice_231, %1005 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1017 = tosa.add %1012, %1016 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1018 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %1019 = tosa.reshape %1018 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_232 = tensor.extract_slice %1019[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_233 = tensor.extract_slice %extracted_slice_232[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %1020 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %1021 = tosa.add %extracted_slice_233, %1020 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_234 = tensor.extract_slice %1021[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_235 = tensor.extract_slice %extracted_slice_234[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_236 = tensor.extract_slice %extracted_slice_235[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_237 = tensor.extract_slice %extracted_slice_236[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_238 = arith.constant 0.000000e+00 : f32
-    %splat_239 = tensor.splat %cst_238 : tensor<40x40xf32>
-    %1022 = tosa.reshape %extracted_slice_237 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %1023 = tosa.add %splat_239, %1022 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %1024 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1025 = tosa.transpose %1017, %1024 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %1026 = tosa.reshape %1011 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1027 = tosa.reshape %1025 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %1028 = tosa.matmul %1026, %1027 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_240 = arith.constant 0.0883883461 : f32
-    %splat_241 = tensor.splat %cst_240 : tensor<32x40x40xf32>
-    %1029 = tosa.mul %1028, %splat_241 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %1030 = tosa.add %1029, %1023 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %1031 = tosa.reduce_max %1030 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1032 = tosa.sub %1030, %1031 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1033 = math.exp %1032 : tensor<32x40x40xf32>
-    %1034 = tosa.reduce_sum %1033 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1035 = tosa.log %1034 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1036 = tosa.add %1031, %1035 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1037 = tosa.sub %1030, %1036 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1038 = math.exp %1037 : tensor<32x40x40xf32>
-    %1039 = tosa.reshape %1036 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %1040 = tosa.reshape %1003 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1041 = tosa.matmul %1038, %1040 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1042 = tosa.reshape %1041 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1043 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1044 = tosa.transpose %1042, %1043 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %1045 = tosa.reshape %1044 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %1046 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1047 = tosa.transpose %arg79, %1046 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1048 = tosa.reshape %1045 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_242 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1049 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1048, %1047 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_242 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1050 = tosa.reshape %1049 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1051 = tosa.add %967, %1050 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1052 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_243 = arith.constant 2 : i32
-    %1053 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1051 : tensor<1x40x4096xf32>) outs(%1052 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_243 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1054 = tosa.reduce_sum %1053 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1055 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1056 = tosa.reciprocal %1055 : (tensor<1xf32>) -> tensor<1xf32>
-    %1057 = tosa.mul %1056, %1054 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1058 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1059 = tosa.add %1057, %1058 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1060 = tosa.rsqrt %1059 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1061 = tosa.mul %1051, %1060 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1062 = tosa.reshape %arg80 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1063 = tosa.mul %1062, %1061 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1064 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1065 = tosa.transpose %arg81, %1064 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1066 = tosa.reshape %1063 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_244 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1067 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1066, %1065 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_244 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1068 = tosa.reshape %1067 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1069 = tosa.sigmoid %1068 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1070 = tosa.mul %1068, %1069 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1071 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1072 = tosa.transpose %arg82, %1071 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1073 = tosa.reshape %1063 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_245 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1074 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1073, %1072 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_245 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1075 = tosa.reshape %1074 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1076 = tosa.mul %1070, %1075 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1077 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1078 = tosa.transpose %arg83, %1077 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %1079 = tosa.reshape %1076 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_246 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1080 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1079, %1078 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_246 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1081 = tosa.reshape %1080 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1082 = tosa.add %1051, %1081 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1083 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_247 = arith.constant 2 : i32
-    %1084 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1082 : tensor<1x40x4096xf32>) outs(%1083 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_247 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1085 = tosa.reduce_sum %1084 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1086 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1087 = tosa.reciprocal %1086 : (tensor<1xf32>) -> tensor<1xf32>
-    %1088 = tosa.mul %1087, %1085 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1089 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1090 = tosa.add %1088, %1089 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1091 = tosa.rsqrt %1090 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1092 = tosa.mul %1082, %1091 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1093 = tosa.reshape %arg84 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1094 = tosa.mul %1093, %1092 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1095 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1096 = tosa.transpose %arg85, %1095 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1097 = tosa.reshape %1094 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_248 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1098 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1097, %1096 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_248 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1099 = tosa.reshape %1098 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1100 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1101 = tosa.transpose %arg86, %1100 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1102 = tosa.reshape %1094 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_249 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1103 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1102, %1101 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_249 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1104 = tosa.reshape %1103 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1105 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1106 = tosa.transpose %arg87, %1105 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1107 = tosa.reshape %1094 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_250 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1108 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1107, %1106 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_250 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1109 = tosa.reshape %1108 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1110 = tosa.reshape %1099 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1111 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1112 = tosa.transpose %1110, %1111 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1113 = tosa.reshape %1104 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1114 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1115 = tosa.transpose %1113, %1114 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1116 = tosa.reshape %1109 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1117 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1118 = tosa.transpose %1116, %1117 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1119 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1120 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1121 = tosa.mul %1112, %1119 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_251 = tensor.extract_slice %1112[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_252 = tensor.extract_slice %1112[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1122 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1123 = linalg.negf ins(%extracted_slice_252 : tensor<1x32x40x64xf32>) outs(%1122 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1124 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_253 = tensor.insert_slice %1123 into %1124[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_254 = tensor.insert_slice %extracted_slice_251 into %inserted_slice_253[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1125 = tosa.mul %inserted_slice_254, %1120 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1126 = tosa.add %1121, %1125 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1127 = tosa.mul %1115, %1119 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_255 = tensor.extract_slice %1115[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_256 = tensor.extract_slice %1115[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1128 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1129 = linalg.negf ins(%extracted_slice_256 : tensor<1x32x40x64xf32>) outs(%1128 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1130 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_257 = tensor.insert_slice %1129 into %1130[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_258 = tensor.insert_slice %extracted_slice_255 into %inserted_slice_257[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1131 = tosa.mul %inserted_slice_258, %1120 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1132 = tosa.add %1127, %1131 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1133 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %1134 = tosa.reshape %1133 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_259 = tensor.extract_slice %1134[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_260 = tensor.extract_slice %extracted_slice_259[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %1135 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %1136 = tosa.add %extracted_slice_260, %1135 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_261 = tensor.extract_slice %1136[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_262 = tensor.extract_slice %extracted_slice_261[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_263 = tensor.extract_slice %extracted_slice_262[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_264 = tensor.extract_slice %extracted_slice_263[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_265 = arith.constant 0.000000e+00 : f32
-    %splat_266 = tensor.splat %cst_265 : tensor<40x40xf32>
-    %1137 = tosa.reshape %extracted_slice_264 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %1138 = tosa.add %splat_266, %1137 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %1139 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1140 = tosa.transpose %1132, %1139 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %1141 = tosa.reshape %1126 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1142 = tosa.reshape %1140 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %1143 = tosa.matmul %1141, %1142 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_267 = arith.constant 0.0883883461 : f32
-    %splat_268 = tensor.splat %cst_267 : tensor<32x40x40xf32>
-    %1144 = tosa.mul %1143, %splat_268 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %1145 = tosa.add %1144, %1138 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %1146 = tosa.reduce_max %1145 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1147 = tosa.sub %1145, %1146 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1148 = math.exp %1147 : tensor<32x40x40xf32>
-    %1149 = tosa.reduce_sum %1148 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1150 = tosa.log %1149 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1151 = tosa.add %1146, %1150 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1152 = tosa.sub %1145, %1151 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1153 = math.exp %1152 : tensor<32x40x40xf32>
-    %1154 = tosa.reshape %1151 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %1155 = tosa.reshape %1118 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1156 = tosa.matmul %1153, %1155 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1157 = tosa.reshape %1156 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1158 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1159 = tosa.transpose %1157, %1158 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %1160 = tosa.reshape %1159 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %1161 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1162 = tosa.transpose %arg88, %1161 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1163 = tosa.reshape %1160 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_269 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1164 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1163, %1162 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_269 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1165 = tosa.reshape %1164 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1166 = tosa.add %1082, %1165 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1167 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_270 = arith.constant 2 : i32
-    %1168 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1166 : tensor<1x40x4096xf32>) outs(%1167 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_270 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1169 = tosa.reduce_sum %1168 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1170 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1171 = tosa.reciprocal %1170 : (tensor<1xf32>) -> tensor<1xf32>
-    %1172 = tosa.mul %1171, %1169 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1173 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1174 = tosa.add %1172, %1173 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1175 = tosa.rsqrt %1174 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1176 = tosa.mul %1166, %1175 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1177 = tosa.reshape %arg89 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1178 = tosa.mul %1177, %1176 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1179 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1180 = tosa.transpose %arg90, %1179 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1181 = tosa.reshape %1178 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_271 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1182 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1181, %1180 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_271 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1183 = tosa.reshape %1182 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1184 = tosa.sigmoid %1183 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1185 = tosa.mul %1183, %1184 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1186 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1187 = tosa.transpose %arg91, %1186 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1188 = tosa.reshape %1178 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_272 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1189 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1188, %1187 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_272 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1190 = tosa.reshape %1189 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1191 = tosa.mul %1185, %1190 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1192 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1193 = tosa.transpose %arg92, %1192 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %1194 = tosa.reshape %1191 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_273 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1195 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1194, %1193 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_273 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1196 = tosa.reshape %1195 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1197 = tosa.add %1166, %1196 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1198 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_274 = arith.constant 2 : i32
-    %1199 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1197 : tensor<1x40x4096xf32>) outs(%1198 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_274 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1200 = tosa.reduce_sum %1199 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1201 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1202 = tosa.reciprocal %1201 : (tensor<1xf32>) -> tensor<1xf32>
-    %1203 = tosa.mul %1202, %1200 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1204 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1205 = tosa.add %1203, %1204 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1206 = tosa.rsqrt %1205 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1207 = tosa.mul %1197, %1206 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1208 = tosa.reshape %arg93 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1209 = tosa.mul %1208, %1207 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1210 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1211 = tosa.transpose %arg94, %1210 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1212 = tosa.reshape %1209 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_275 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1213 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1212, %1211 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_275 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1214 = tosa.reshape %1213 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1215 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1216 = tosa.transpose %arg95, %1215 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1217 = tosa.reshape %1209 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_276 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1218 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1217, %1216 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_276 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1219 = tosa.reshape %1218 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1220 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1221 = tosa.transpose %arg96, %1220 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1222 = tosa.reshape %1209 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_277 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1223 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1222, %1221 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_277 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1224 = tosa.reshape %1223 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1225 = tosa.reshape %1214 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1226 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1227 = tosa.transpose %1225, %1226 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1228 = tosa.reshape %1219 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1229 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1230 = tosa.transpose %1228, %1229 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1231 = tosa.reshape %1224 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1232 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1233 = tosa.transpose %1231, %1232 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1234 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1235 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1236 = tosa.mul %1227, %1234 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_278 = tensor.extract_slice %1227[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_279 = tensor.extract_slice %1227[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1237 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1238 = linalg.negf ins(%extracted_slice_279 : tensor<1x32x40x64xf32>) outs(%1237 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1239 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_280 = tensor.insert_slice %1238 into %1239[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_281 = tensor.insert_slice %extracted_slice_278 into %inserted_slice_280[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1240 = tosa.mul %inserted_slice_281, %1235 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1241 = tosa.add %1236, %1240 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1242 = tosa.mul %1230, %1234 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_282 = tensor.extract_slice %1230[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_283 = tensor.extract_slice %1230[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1243 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1244 = linalg.negf ins(%extracted_slice_283 : tensor<1x32x40x64xf32>) outs(%1243 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1245 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_284 = tensor.insert_slice %1244 into %1245[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_285 = tensor.insert_slice %extracted_slice_282 into %inserted_slice_284[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1246 = tosa.mul %inserted_slice_285, %1235 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1247 = tosa.add %1242, %1246 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1248 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %1249 = tosa.reshape %1248 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_286 = tensor.extract_slice %1249[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_287 = tensor.extract_slice %extracted_slice_286[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %1250 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %1251 = tosa.add %extracted_slice_287, %1250 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_288 = tensor.extract_slice %1251[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_289 = tensor.extract_slice %extracted_slice_288[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_290 = tensor.extract_slice %extracted_slice_289[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_291 = tensor.extract_slice %extracted_slice_290[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_292 = arith.constant 0.000000e+00 : f32
-    %splat_293 = tensor.splat %cst_292 : tensor<40x40xf32>
-    %1252 = tosa.reshape %extracted_slice_291 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %1253 = tosa.add %splat_293, %1252 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %1254 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1255 = tosa.transpose %1247, %1254 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %1256 = tosa.reshape %1241 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1257 = tosa.reshape %1255 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %1258 = tosa.matmul %1256, %1257 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_294 = arith.constant 0.0883883461 : f32
-    %splat_295 = tensor.splat %cst_294 : tensor<32x40x40xf32>
-    %1259 = tosa.mul %1258, %splat_295 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %1260 = tosa.add %1259, %1253 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %1261 = tosa.reduce_max %1260 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1262 = tosa.sub %1260, %1261 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1263 = math.exp %1262 : tensor<32x40x40xf32>
-    %1264 = tosa.reduce_sum %1263 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1265 = tosa.log %1264 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1266 = tosa.add %1261, %1265 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1267 = tosa.sub %1260, %1266 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1268 = math.exp %1267 : tensor<32x40x40xf32>
-    %1269 = tosa.reshape %1266 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %1270 = tosa.reshape %1233 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1271 = tosa.matmul %1268, %1270 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1272 = tosa.reshape %1271 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1273 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1274 = tosa.transpose %1272, %1273 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %1275 = tosa.reshape %1274 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %1276 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1277 = tosa.transpose %arg97, %1276 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1278 = tosa.reshape %1275 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_296 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1279 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1278, %1277 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_296 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1280 = tosa.reshape %1279 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1281 = tosa.add %1197, %1280 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1282 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_297 = arith.constant 2 : i32
-    %1283 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1281 : tensor<1x40x4096xf32>) outs(%1282 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_297 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1284 = tosa.reduce_sum %1283 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1285 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1286 = tosa.reciprocal %1285 : (tensor<1xf32>) -> tensor<1xf32>
-    %1287 = tosa.mul %1286, %1284 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1288 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1289 = tosa.add %1287, %1288 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1290 = tosa.rsqrt %1289 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1291 = tosa.mul %1281, %1290 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1292 = tosa.reshape %arg98 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1293 = tosa.mul %1292, %1291 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1294 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1295 = tosa.transpose %arg99, %1294 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1296 = tosa.reshape %1293 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_298 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1297 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1296, %1295 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_298 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1298 = tosa.reshape %1297 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1299 = tosa.sigmoid %1298 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1300 = tosa.mul %1298, %1299 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1301 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1302 = tosa.transpose %arg100, %1301 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1303 = tosa.reshape %1293 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_299 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1304 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1303, %1302 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_299 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1305 = tosa.reshape %1304 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1306 = tosa.mul %1300, %1305 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1307 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1308 = tosa.transpose %arg101, %1307 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %1309 = tosa.reshape %1306 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_300 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1310 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1309, %1308 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_300 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1311 = tosa.reshape %1310 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1312 = tosa.add %1281, %1311 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1313 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_301 = arith.constant 2 : i32
-    %1314 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1312 : tensor<1x40x4096xf32>) outs(%1313 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_301 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1315 = tosa.reduce_sum %1314 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1316 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1317 = tosa.reciprocal %1316 : (tensor<1xf32>) -> tensor<1xf32>
-    %1318 = tosa.mul %1317, %1315 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1319 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1320 = tosa.add %1318, %1319 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1321 = tosa.rsqrt %1320 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1322 = tosa.mul %1312, %1321 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1323 = tosa.reshape %arg102 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1324 = tosa.mul %1323, %1322 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1325 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1326 = tosa.transpose %arg103, %1325 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1327 = tosa.reshape %1324 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_302 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1328 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1327, %1326 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_302 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1329 = tosa.reshape %1328 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1330 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1331 = tosa.transpose %arg104, %1330 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1332 = tosa.reshape %1324 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_303 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1333 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1332, %1331 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_303 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1334 = tosa.reshape %1333 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1335 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1336 = tosa.transpose %arg105, %1335 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1337 = tosa.reshape %1324 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_304 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1338 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1337, %1336 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_304 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1339 = tosa.reshape %1338 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1340 = tosa.reshape %1329 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1341 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1342 = tosa.transpose %1340, %1341 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1343 = tosa.reshape %1334 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1344 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1345 = tosa.transpose %1343, %1344 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1346 = tosa.reshape %1339 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1347 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1348 = tosa.transpose %1346, %1347 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1349 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1350 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1351 = tosa.mul %1342, %1349 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_305 = tensor.extract_slice %1342[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_306 = tensor.extract_slice %1342[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1352 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1353 = linalg.negf ins(%extracted_slice_306 : tensor<1x32x40x64xf32>) outs(%1352 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1354 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_307 = tensor.insert_slice %1353 into %1354[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_308 = tensor.insert_slice %extracted_slice_305 into %inserted_slice_307[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1355 = tosa.mul %inserted_slice_308, %1350 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1356 = tosa.add %1351, %1355 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1357 = tosa.mul %1345, %1349 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_309 = tensor.extract_slice %1345[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_310 = tensor.extract_slice %1345[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1358 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1359 = linalg.negf ins(%extracted_slice_310 : tensor<1x32x40x64xf32>) outs(%1358 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1360 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_311 = tensor.insert_slice %1359 into %1360[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_312 = tensor.insert_slice %extracted_slice_309 into %inserted_slice_311[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1361 = tosa.mul %inserted_slice_312, %1350 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1362 = tosa.add %1357, %1361 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1363 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %1364 = tosa.reshape %1363 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_313 = tensor.extract_slice %1364[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_314 = tensor.extract_slice %extracted_slice_313[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %1365 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %1366 = tosa.add %extracted_slice_314, %1365 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_315 = tensor.extract_slice %1366[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_316 = tensor.extract_slice %extracted_slice_315[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_317 = tensor.extract_slice %extracted_slice_316[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_318 = tensor.extract_slice %extracted_slice_317[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_319 = arith.constant 0.000000e+00 : f32
-    %splat_320 = tensor.splat %cst_319 : tensor<40x40xf32>
-    %1367 = tosa.reshape %extracted_slice_318 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %1368 = tosa.add %splat_320, %1367 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %1369 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1370 = tosa.transpose %1362, %1369 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %1371 = tosa.reshape %1356 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1372 = tosa.reshape %1370 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %1373 = tosa.matmul %1371, %1372 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_321 = arith.constant 0.0883883461 : f32
-    %splat_322 = tensor.splat %cst_321 : tensor<32x40x40xf32>
-    %1374 = tosa.mul %1373, %splat_322 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %1375 = tosa.add %1374, %1368 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %1376 = tosa.reduce_max %1375 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1377 = tosa.sub %1375, %1376 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1378 = math.exp %1377 : tensor<32x40x40xf32>
-    %1379 = tosa.reduce_sum %1378 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1380 = tosa.log %1379 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1381 = tosa.add %1376, %1380 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1382 = tosa.sub %1375, %1381 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1383 = math.exp %1382 : tensor<32x40x40xf32>
-    %1384 = tosa.reshape %1381 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %1385 = tosa.reshape %1348 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1386 = tosa.matmul %1383, %1385 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1387 = tosa.reshape %1386 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1388 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1389 = tosa.transpose %1387, %1388 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %1390 = tosa.reshape %1389 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %1391 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1392 = tosa.transpose %arg106, %1391 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1393 = tosa.reshape %1390 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_323 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1394 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1393, %1392 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_323 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1395 = tosa.reshape %1394 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1396 = tosa.add %1312, %1395 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1397 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_324 = arith.constant 2 : i32
-    %1398 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1396 : tensor<1x40x4096xf32>) outs(%1397 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_324 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1399 = tosa.reduce_sum %1398 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1400 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1401 = tosa.reciprocal %1400 : (tensor<1xf32>) -> tensor<1xf32>
-    %1402 = tosa.mul %1401, %1399 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1403 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1404 = tosa.add %1402, %1403 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1405 = tosa.rsqrt %1404 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1406 = tosa.mul %1396, %1405 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1407 = tosa.reshape %arg107 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1408 = tosa.mul %1407, %1406 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1409 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1410 = tosa.transpose %arg108, %1409 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1411 = tosa.reshape %1408 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_325 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1412 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1411, %1410 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_325 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1413 = tosa.reshape %1412 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1414 = tosa.sigmoid %1413 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1415 = tosa.mul %1413, %1414 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1416 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1417 = tosa.transpose %arg109, %1416 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1418 = tosa.reshape %1408 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_326 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1419 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1418, %1417 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_326 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1420 = tosa.reshape %1419 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1421 = tosa.mul %1415, %1420 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1422 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1423 = tosa.transpose %arg110, %1422 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %1424 = tosa.reshape %1421 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_327 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1425 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1424, %1423 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_327 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1426 = tosa.reshape %1425 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1427 = tosa.add %1396, %1426 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1428 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_328 = arith.constant 2 : i32
-    %1429 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1427 : tensor<1x40x4096xf32>) outs(%1428 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_328 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1430 = tosa.reduce_sum %1429 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1431 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1432 = tosa.reciprocal %1431 : (tensor<1xf32>) -> tensor<1xf32>
-    %1433 = tosa.mul %1432, %1430 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1434 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1435 = tosa.add %1433, %1434 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1436 = tosa.rsqrt %1435 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1437 = tosa.mul %1427, %1436 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1438 = tosa.reshape %arg111 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1439 = tosa.mul %1438, %1437 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1440 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1441 = tosa.transpose %arg112, %1440 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1442 = tosa.reshape %1439 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_329 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1443 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1442, %1441 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_329 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1444 = tosa.reshape %1443 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1445 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1446 = tosa.transpose %arg113, %1445 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1447 = tosa.reshape %1439 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_330 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1448 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1447, %1446 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_330 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1449 = tosa.reshape %1448 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1450 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1451 = tosa.transpose %arg114, %1450 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1452 = tosa.reshape %1439 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_331 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1453 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1452, %1451 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_331 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1454 = tosa.reshape %1453 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1455 = tosa.reshape %1444 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1456 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1457 = tosa.transpose %1455, %1456 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1458 = tosa.reshape %1449 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1459 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1460 = tosa.transpose %1458, %1459 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1461 = tosa.reshape %1454 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1462 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1463 = tosa.transpose %1461, %1462 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1464 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1465 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1466 = tosa.mul %1457, %1464 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_332 = tensor.extract_slice %1457[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_333 = tensor.extract_slice %1457[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1467 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1468 = linalg.negf ins(%extracted_slice_333 : tensor<1x32x40x64xf32>) outs(%1467 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1469 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_334 = tensor.insert_slice %1468 into %1469[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_335 = tensor.insert_slice %extracted_slice_332 into %inserted_slice_334[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1470 = tosa.mul %inserted_slice_335, %1465 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1471 = tosa.add %1466, %1470 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1472 = tosa.mul %1460, %1464 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_336 = tensor.extract_slice %1460[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_337 = tensor.extract_slice %1460[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1473 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1474 = linalg.negf ins(%extracted_slice_337 : tensor<1x32x40x64xf32>) outs(%1473 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1475 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_338 = tensor.insert_slice %1474 into %1475[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_339 = tensor.insert_slice %extracted_slice_336 into %inserted_slice_338[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1476 = tosa.mul %inserted_slice_339, %1465 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1477 = tosa.add %1472, %1476 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1478 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %1479 = tosa.reshape %1478 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_340 = tensor.extract_slice %1479[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_341 = tensor.extract_slice %extracted_slice_340[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %1480 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %1481 = tosa.add %extracted_slice_341, %1480 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_342 = tensor.extract_slice %1481[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_343 = tensor.extract_slice %extracted_slice_342[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_344 = tensor.extract_slice %extracted_slice_343[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_345 = tensor.extract_slice %extracted_slice_344[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_346 = arith.constant 0.000000e+00 : f32
-    %splat_347 = tensor.splat %cst_346 : tensor<40x40xf32>
-    %1482 = tosa.reshape %extracted_slice_345 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %1483 = tosa.add %splat_347, %1482 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %1484 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1485 = tosa.transpose %1477, %1484 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %1486 = tosa.reshape %1471 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1487 = tosa.reshape %1485 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %1488 = tosa.matmul %1486, %1487 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_348 = arith.constant 0.0883883461 : f32
-    %splat_349 = tensor.splat %cst_348 : tensor<32x40x40xf32>
-    %1489 = tosa.mul %1488, %splat_349 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %1490 = tosa.add %1489, %1483 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %1491 = tosa.reduce_max %1490 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1492 = tosa.sub %1490, %1491 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1493 = math.exp %1492 : tensor<32x40x40xf32>
-    %1494 = tosa.reduce_sum %1493 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1495 = tosa.log %1494 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1496 = tosa.add %1491, %1495 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1497 = tosa.sub %1490, %1496 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1498 = math.exp %1497 : tensor<32x40x40xf32>
-    %1499 = tosa.reshape %1496 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %1500 = tosa.reshape %1463 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1501 = tosa.matmul %1498, %1500 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1502 = tosa.reshape %1501 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1503 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1504 = tosa.transpose %1502, %1503 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %1505 = tosa.reshape %1504 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %1506 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1507 = tosa.transpose %arg115, %1506 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1508 = tosa.reshape %1505 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_350 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1509 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1508, %1507 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_350 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1510 = tosa.reshape %1509 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1511 = tosa.add %1427, %1510 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1512 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_351 = arith.constant 2 : i32
-    %1513 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1511 : tensor<1x40x4096xf32>) outs(%1512 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_351 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1514 = tosa.reduce_sum %1513 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1515 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1516 = tosa.reciprocal %1515 : (tensor<1xf32>) -> tensor<1xf32>
-    %1517 = tosa.mul %1516, %1514 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1518 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1519 = tosa.add %1517, %1518 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1520 = tosa.rsqrt %1519 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1521 = tosa.mul %1511, %1520 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1522 = tosa.reshape %arg116 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1523 = tosa.mul %1522, %1521 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1524 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1525 = tosa.transpose %arg117, %1524 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1526 = tosa.reshape %1523 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_352 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1527 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1526, %1525 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_352 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1528 = tosa.reshape %1527 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1529 = tosa.sigmoid %1528 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1530 = tosa.mul %1528, %1529 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1531 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1532 = tosa.transpose %arg118, %1531 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1533 = tosa.reshape %1523 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_353 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1534 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1533, %1532 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_353 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1535 = tosa.reshape %1534 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1536 = tosa.mul %1530, %1535 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1537 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1538 = tosa.transpose %arg119, %1537 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %1539 = tosa.reshape %1536 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_354 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1540 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1539, %1538 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_354 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1541 = tosa.reshape %1540 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1542 = tosa.add %1511, %1541 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1543 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_355 = arith.constant 2 : i32
-    %1544 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1542 : tensor<1x40x4096xf32>) outs(%1543 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_355 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1545 = tosa.reduce_sum %1544 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1546 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1547 = tosa.reciprocal %1546 : (tensor<1xf32>) -> tensor<1xf32>
-    %1548 = tosa.mul %1547, %1545 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1549 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1550 = tosa.add %1548, %1549 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1551 = tosa.rsqrt %1550 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1552 = tosa.mul %1542, %1551 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1553 = tosa.reshape %arg120 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1554 = tosa.mul %1553, %1552 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1555 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1556 = tosa.transpose %arg121, %1555 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1557 = tosa.reshape %1554 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_356 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1558 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1557, %1556 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_356 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1559 = tosa.reshape %1558 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1560 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1561 = tosa.transpose %arg122, %1560 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1562 = tosa.reshape %1554 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_357 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1563 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1562, %1561 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_357 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1564 = tosa.reshape %1563 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1565 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1566 = tosa.transpose %arg123, %1565 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1567 = tosa.reshape %1554 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_358 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1568 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1567, %1566 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_358 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1569 = tosa.reshape %1568 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1570 = tosa.reshape %1559 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1571 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1572 = tosa.transpose %1570, %1571 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1573 = tosa.reshape %1564 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1574 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1575 = tosa.transpose %1573, %1574 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1576 = tosa.reshape %1569 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1577 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1578 = tosa.transpose %1576, %1577 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1579 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1580 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1581 = tosa.mul %1572, %1579 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_359 = tensor.extract_slice %1572[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_360 = tensor.extract_slice %1572[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1582 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1583 = linalg.negf ins(%extracted_slice_360 : tensor<1x32x40x64xf32>) outs(%1582 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1584 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_361 = tensor.insert_slice %1583 into %1584[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_362 = tensor.insert_slice %extracted_slice_359 into %inserted_slice_361[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1585 = tosa.mul %inserted_slice_362, %1580 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1586 = tosa.add %1581, %1585 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1587 = tosa.mul %1575, %1579 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_363 = tensor.extract_slice %1575[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_364 = tensor.extract_slice %1575[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1588 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1589 = linalg.negf ins(%extracted_slice_364 : tensor<1x32x40x64xf32>) outs(%1588 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1590 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_365 = tensor.insert_slice %1589 into %1590[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_366 = tensor.insert_slice %extracted_slice_363 into %inserted_slice_365[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1591 = tosa.mul %inserted_slice_366, %1580 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1592 = tosa.add %1587, %1591 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1593 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %1594 = tosa.reshape %1593 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_367 = tensor.extract_slice %1594[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_368 = tensor.extract_slice %extracted_slice_367[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %1595 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %1596 = tosa.add %extracted_slice_368, %1595 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_369 = tensor.extract_slice %1596[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_370 = tensor.extract_slice %extracted_slice_369[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_371 = tensor.extract_slice %extracted_slice_370[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_372 = tensor.extract_slice %extracted_slice_371[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_373 = arith.constant 0.000000e+00 : f32
-    %splat_374 = tensor.splat %cst_373 : tensor<40x40xf32>
-    %1597 = tosa.reshape %extracted_slice_372 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %1598 = tosa.add %splat_374, %1597 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %1599 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1600 = tosa.transpose %1592, %1599 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %1601 = tosa.reshape %1586 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1602 = tosa.reshape %1600 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %1603 = tosa.matmul %1601, %1602 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_375 = arith.constant 0.0883883461 : f32
-    %splat_376 = tensor.splat %cst_375 : tensor<32x40x40xf32>
-    %1604 = tosa.mul %1603, %splat_376 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %1605 = tosa.add %1604, %1598 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %1606 = tosa.reduce_max %1605 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1607 = tosa.sub %1605, %1606 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1608 = math.exp %1607 : tensor<32x40x40xf32>
-    %1609 = tosa.reduce_sum %1608 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1610 = tosa.log %1609 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1611 = tosa.add %1606, %1610 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1612 = tosa.sub %1605, %1611 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1613 = math.exp %1612 : tensor<32x40x40xf32>
-    %1614 = tosa.reshape %1611 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %1615 = tosa.reshape %1578 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1616 = tosa.matmul %1613, %1615 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1617 = tosa.reshape %1616 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1618 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1619 = tosa.transpose %1617, %1618 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %1620 = tosa.reshape %1619 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %1621 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1622 = tosa.transpose %arg124, %1621 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1623 = tosa.reshape %1620 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_377 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1624 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1623, %1622 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_377 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1625 = tosa.reshape %1624 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1626 = tosa.add %1542, %1625 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1627 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_378 = arith.constant 2 : i32
-    %1628 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1626 : tensor<1x40x4096xf32>) outs(%1627 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_378 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1629 = tosa.reduce_sum %1628 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1630 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1631 = tosa.reciprocal %1630 : (tensor<1xf32>) -> tensor<1xf32>
-    %1632 = tosa.mul %1631, %1629 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1633 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1634 = tosa.add %1632, %1633 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1635 = tosa.rsqrt %1634 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1636 = tosa.mul %1626, %1635 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1637 = tosa.reshape %arg125 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1638 = tosa.mul %1637, %1636 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1639 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1640 = tosa.transpose %arg126, %1639 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1641 = tosa.reshape %1638 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_379 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1642 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1641, %1640 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_379 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1643 = tosa.reshape %1642 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1644 = tosa.sigmoid %1643 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1645 = tosa.mul %1643, %1644 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1646 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1647 = tosa.transpose %arg127, %1646 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1648 = tosa.reshape %1638 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_380 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1649 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1648, %1647 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_380 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1650 = tosa.reshape %1649 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1651 = tosa.mul %1645, %1650 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1652 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1653 = tosa.transpose %arg128, %1652 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %1654 = tosa.reshape %1651 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_381 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1655 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1654, %1653 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_381 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1656 = tosa.reshape %1655 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1657 = tosa.add %1626, %1656 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1658 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_382 = arith.constant 2 : i32
-    %1659 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1657 : tensor<1x40x4096xf32>) outs(%1658 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_382 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1660 = tosa.reduce_sum %1659 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1661 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1662 = tosa.reciprocal %1661 : (tensor<1xf32>) -> tensor<1xf32>
-    %1663 = tosa.mul %1662, %1660 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1664 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1665 = tosa.add %1663, %1664 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1666 = tosa.rsqrt %1665 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1667 = tosa.mul %1657, %1666 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1668 = tosa.reshape %arg129 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1669 = tosa.mul %1668, %1667 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1670 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1671 = tosa.transpose %arg130, %1670 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1672 = tosa.reshape %1669 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_383 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1673 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1672, %1671 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_383 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1674 = tosa.reshape %1673 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1675 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1676 = tosa.transpose %arg131, %1675 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1677 = tosa.reshape %1669 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_384 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1678 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1677, %1676 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_384 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1679 = tosa.reshape %1678 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1680 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1681 = tosa.transpose %arg132, %1680 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1682 = tosa.reshape %1669 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_385 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1683 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1682, %1681 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_385 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1684 = tosa.reshape %1683 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1685 = tosa.reshape %1674 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1686 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1687 = tosa.transpose %1685, %1686 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1688 = tosa.reshape %1679 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1689 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1690 = tosa.transpose %1688, %1689 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1691 = tosa.reshape %1684 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1692 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1693 = tosa.transpose %1691, %1692 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1694 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1695 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1696 = tosa.mul %1687, %1694 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_386 = tensor.extract_slice %1687[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_387 = tensor.extract_slice %1687[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1697 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1698 = linalg.negf ins(%extracted_slice_387 : tensor<1x32x40x64xf32>) outs(%1697 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1699 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_388 = tensor.insert_slice %1698 into %1699[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_389 = tensor.insert_slice %extracted_slice_386 into %inserted_slice_388[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1700 = tosa.mul %inserted_slice_389, %1695 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1701 = tosa.add %1696, %1700 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1702 = tosa.mul %1690, %1694 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_390 = tensor.extract_slice %1690[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_391 = tensor.extract_slice %1690[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1703 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1704 = linalg.negf ins(%extracted_slice_391 : tensor<1x32x40x64xf32>) outs(%1703 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1705 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_392 = tensor.insert_slice %1704 into %1705[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_393 = tensor.insert_slice %extracted_slice_390 into %inserted_slice_392[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1706 = tosa.mul %inserted_slice_393, %1695 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1707 = tosa.add %1702, %1706 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1708 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %1709 = tosa.reshape %1708 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_394 = tensor.extract_slice %1709[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_395 = tensor.extract_slice %extracted_slice_394[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %1710 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %1711 = tosa.add %extracted_slice_395, %1710 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_396 = tensor.extract_slice %1711[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_397 = tensor.extract_slice %extracted_slice_396[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_398 = tensor.extract_slice %extracted_slice_397[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_399 = tensor.extract_slice %extracted_slice_398[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_400 = arith.constant 0.000000e+00 : f32
-    %splat_401 = tensor.splat %cst_400 : tensor<40x40xf32>
-    %1712 = tosa.reshape %extracted_slice_399 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %1713 = tosa.add %splat_401, %1712 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %1714 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1715 = tosa.transpose %1707, %1714 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %1716 = tosa.reshape %1701 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1717 = tosa.reshape %1715 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %1718 = tosa.matmul %1716, %1717 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_402 = arith.constant 0.0883883461 : f32
-    %splat_403 = tensor.splat %cst_402 : tensor<32x40x40xf32>
-    %1719 = tosa.mul %1718, %splat_403 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %1720 = tosa.add %1719, %1713 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %1721 = tosa.reduce_max %1720 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1722 = tosa.sub %1720, %1721 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1723 = math.exp %1722 : tensor<32x40x40xf32>
-    %1724 = tosa.reduce_sum %1723 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1725 = tosa.log %1724 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1726 = tosa.add %1721, %1725 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1727 = tosa.sub %1720, %1726 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1728 = math.exp %1727 : tensor<32x40x40xf32>
-    %1729 = tosa.reshape %1726 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %1730 = tosa.reshape %1693 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1731 = tosa.matmul %1728, %1730 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1732 = tosa.reshape %1731 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1733 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1734 = tosa.transpose %1732, %1733 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %1735 = tosa.reshape %1734 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %1736 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1737 = tosa.transpose %arg133, %1736 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1738 = tosa.reshape %1735 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_404 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1739 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1738, %1737 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_404 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1740 = tosa.reshape %1739 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1741 = tosa.add %1657, %1740 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1742 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_405 = arith.constant 2 : i32
-    %1743 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1741 : tensor<1x40x4096xf32>) outs(%1742 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_405 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1744 = tosa.reduce_sum %1743 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1745 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1746 = tosa.reciprocal %1745 : (tensor<1xf32>) -> tensor<1xf32>
-    %1747 = tosa.mul %1746, %1744 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1748 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1749 = tosa.add %1747, %1748 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1750 = tosa.rsqrt %1749 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1751 = tosa.mul %1741, %1750 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1752 = tosa.reshape %arg134 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1753 = tosa.mul %1752, %1751 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1754 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1755 = tosa.transpose %arg135, %1754 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1756 = tosa.reshape %1753 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_406 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1757 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1756, %1755 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_406 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1758 = tosa.reshape %1757 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1759 = tosa.sigmoid %1758 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1760 = tosa.mul %1758, %1759 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1761 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1762 = tosa.transpose %arg136, %1761 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1763 = tosa.reshape %1753 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_407 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1764 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1763, %1762 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_407 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1765 = tosa.reshape %1764 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1766 = tosa.mul %1760, %1765 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1767 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1768 = tosa.transpose %arg137, %1767 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %1769 = tosa.reshape %1766 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_408 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1770 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1769, %1768 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_408 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1771 = tosa.reshape %1770 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1772 = tosa.add %1741, %1771 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1773 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_409 = arith.constant 2 : i32
-    %1774 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1772 : tensor<1x40x4096xf32>) outs(%1773 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_409 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1775 = tosa.reduce_sum %1774 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1776 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1777 = tosa.reciprocal %1776 : (tensor<1xf32>) -> tensor<1xf32>
-    %1778 = tosa.mul %1777, %1775 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1779 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1780 = tosa.add %1778, %1779 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1781 = tosa.rsqrt %1780 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1782 = tosa.mul %1772, %1781 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1783 = tosa.reshape %arg138 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1784 = tosa.mul %1783, %1782 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1785 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1786 = tosa.transpose %arg139, %1785 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1787 = tosa.reshape %1784 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_410 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1788 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1787, %1786 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_410 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1789 = tosa.reshape %1788 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1790 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1791 = tosa.transpose %arg140, %1790 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1792 = tosa.reshape %1784 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_411 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1793 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1792, %1791 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_411 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1794 = tosa.reshape %1793 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1795 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1796 = tosa.transpose %arg141, %1795 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1797 = tosa.reshape %1784 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_412 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1798 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1797, %1796 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_412 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1799 = tosa.reshape %1798 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1800 = tosa.reshape %1789 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1801 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1802 = tosa.transpose %1800, %1801 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1803 = tosa.reshape %1794 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1804 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1805 = tosa.transpose %1803, %1804 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1806 = tosa.reshape %1799 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1807 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1808 = tosa.transpose %1806, %1807 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1809 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1810 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1811 = tosa.mul %1802, %1809 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_413 = tensor.extract_slice %1802[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_414 = tensor.extract_slice %1802[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1812 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1813 = linalg.negf ins(%extracted_slice_414 : tensor<1x32x40x64xf32>) outs(%1812 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1814 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_415 = tensor.insert_slice %1813 into %1814[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_416 = tensor.insert_slice %extracted_slice_413 into %inserted_slice_415[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1815 = tosa.mul %inserted_slice_416, %1810 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1816 = tosa.add %1811, %1815 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1817 = tosa.mul %1805, %1809 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_417 = tensor.extract_slice %1805[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_418 = tensor.extract_slice %1805[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1818 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1819 = linalg.negf ins(%extracted_slice_418 : tensor<1x32x40x64xf32>) outs(%1818 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1820 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_419 = tensor.insert_slice %1819 into %1820[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_420 = tensor.insert_slice %extracted_slice_417 into %inserted_slice_419[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1821 = tosa.mul %inserted_slice_420, %1810 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1822 = tosa.add %1817, %1821 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1823 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %1824 = tosa.reshape %1823 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_421 = tensor.extract_slice %1824[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_422 = tensor.extract_slice %extracted_slice_421[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %1825 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %1826 = tosa.add %extracted_slice_422, %1825 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_423 = tensor.extract_slice %1826[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_424 = tensor.extract_slice %extracted_slice_423[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_425 = tensor.extract_slice %extracted_slice_424[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_426 = tensor.extract_slice %extracted_slice_425[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_427 = arith.constant 0.000000e+00 : f32
-    %splat_428 = tensor.splat %cst_427 : tensor<40x40xf32>
-    %1827 = tosa.reshape %extracted_slice_426 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %1828 = tosa.add %splat_428, %1827 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %1829 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1830 = tosa.transpose %1822, %1829 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %1831 = tosa.reshape %1816 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1832 = tosa.reshape %1830 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %1833 = tosa.matmul %1831, %1832 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_429 = arith.constant 0.0883883461 : f32
-    %splat_430 = tensor.splat %cst_429 : tensor<32x40x40xf32>
-    %1834 = tosa.mul %1833, %splat_430 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %1835 = tosa.add %1834, %1828 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %1836 = tosa.reduce_max %1835 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1837 = tosa.sub %1835, %1836 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1838 = math.exp %1837 : tensor<32x40x40xf32>
-    %1839 = tosa.reduce_sum %1838 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1840 = tosa.log %1839 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1841 = tosa.add %1836, %1840 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1842 = tosa.sub %1835, %1841 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1843 = math.exp %1842 : tensor<32x40x40xf32>
-    %1844 = tosa.reshape %1841 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %1845 = tosa.reshape %1808 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1846 = tosa.matmul %1843, %1845 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1847 = tosa.reshape %1846 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1848 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1849 = tosa.transpose %1847, %1848 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %1850 = tosa.reshape %1849 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %1851 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1852 = tosa.transpose %arg142, %1851 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1853 = tosa.reshape %1850 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_431 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1854 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1853, %1852 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_431 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1855 = tosa.reshape %1854 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1856 = tosa.add %1772, %1855 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1857 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_432 = arith.constant 2 : i32
-    %1858 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1856 : tensor<1x40x4096xf32>) outs(%1857 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_432 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1859 = tosa.reduce_sum %1858 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1860 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1861 = tosa.reciprocal %1860 : (tensor<1xf32>) -> tensor<1xf32>
-    %1862 = tosa.mul %1861, %1859 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1863 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1864 = tosa.add %1862, %1863 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1865 = tosa.rsqrt %1864 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1866 = tosa.mul %1856, %1865 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1867 = tosa.reshape %arg143 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1868 = tosa.mul %1867, %1866 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1869 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1870 = tosa.transpose %arg144, %1869 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1871 = tosa.reshape %1868 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_433 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1872 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1871, %1870 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_433 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1873 = tosa.reshape %1872 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1874 = tosa.sigmoid %1873 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1875 = tosa.mul %1873, %1874 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1876 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1877 = tosa.transpose %arg145, %1876 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1878 = tosa.reshape %1868 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_434 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1879 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1878, %1877 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_434 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1880 = tosa.reshape %1879 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1881 = tosa.mul %1875, %1880 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1882 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1883 = tosa.transpose %arg146, %1882 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %1884 = tosa.reshape %1881 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_435 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1885 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1884, %1883 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_435 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1886 = tosa.reshape %1885 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1887 = tosa.add %1856, %1886 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1888 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_436 = arith.constant 2 : i32
-    %1889 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1887 : tensor<1x40x4096xf32>) outs(%1888 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_436 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1890 = tosa.reduce_sum %1889 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1891 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1892 = tosa.reciprocal %1891 : (tensor<1xf32>) -> tensor<1xf32>
-    %1893 = tosa.mul %1892, %1890 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1894 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1895 = tosa.add %1893, %1894 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1896 = tosa.rsqrt %1895 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1897 = tosa.mul %1887, %1896 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1898 = tosa.reshape %arg147 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1899 = tosa.mul %1898, %1897 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1900 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1901 = tosa.transpose %arg148, %1900 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1902 = tosa.reshape %1899 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_437 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1903 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1902, %1901 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_437 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1904 = tosa.reshape %1903 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1905 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1906 = tosa.transpose %arg149, %1905 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1907 = tosa.reshape %1899 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_438 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1908 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1907, %1906 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_438 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1909 = tosa.reshape %1908 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1910 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1911 = tosa.transpose %arg150, %1910 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1912 = tosa.reshape %1899 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_439 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1913 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1912, %1911 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_439 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1914 = tosa.reshape %1913 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1915 = tosa.reshape %1904 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1916 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1917 = tosa.transpose %1915, %1916 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1918 = tosa.reshape %1909 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1919 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1920 = tosa.transpose %1918, %1919 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1921 = tosa.reshape %1914 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %1922 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1923 = tosa.transpose %1921, %1922 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %1924 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1925 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %1926 = tosa.mul %1917, %1924 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_440 = tensor.extract_slice %1917[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_441 = tensor.extract_slice %1917[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1927 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1928 = linalg.negf ins(%extracted_slice_441 : tensor<1x32x40x64xf32>) outs(%1927 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1929 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_442 = tensor.insert_slice %1928 into %1929[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_443 = tensor.insert_slice %extracted_slice_440 into %inserted_slice_442[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1930 = tosa.mul %inserted_slice_443, %1925 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1931 = tosa.add %1926, %1930 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1932 = tosa.mul %1920, %1924 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_444 = tensor.extract_slice %1920[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_445 = tensor.extract_slice %1920[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %1933 = tensor.empty() : tensor<1x32x40x64xf32>
-    %1934 = linalg.negf ins(%extracted_slice_445 : tensor<1x32x40x64xf32>) outs(%1933 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %1935 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_446 = tensor.insert_slice %1934 into %1935[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_447 = tensor.insert_slice %extracted_slice_444 into %inserted_slice_446[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %1936 = tosa.mul %inserted_slice_447, %1925 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1937 = tosa.add %1932, %1936 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1938 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %1939 = tosa.reshape %1938 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_448 = tensor.extract_slice %1939[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_449 = tensor.extract_slice %extracted_slice_448[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %1940 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %1941 = tosa.add %extracted_slice_449, %1940 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_450 = tensor.extract_slice %1941[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_451 = tensor.extract_slice %extracted_slice_450[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_452 = tensor.extract_slice %extracted_slice_451[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_453 = tensor.extract_slice %extracted_slice_452[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_454 = arith.constant 0.000000e+00 : f32
-    %splat_455 = tensor.splat %cst_454 : tensor<40x40xf32>
-    %1942 = tosa.reshape %extracted_slice_453 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %1943 = tosa.add %splat_455, %1942 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %1944 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1945 = tosa.transpose %1937, %1944 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %1946 = tosa.reshape %1931 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1947 = tosa.reshape %1945 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %1948 = tosa.matmul %1946, %1947 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_456 = arith.constant 0.0883883461 : f32
-    %splat_457 = tensor.splat %cst_456 : tensor<32x40x40xf32>
-    %1949 = tosa.mul %1948, %splat_457 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %1950 = tosa.add %1949, %1943 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %1951 = tosa.reduce_max %1950 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1952 = tosa.sub %1950, %1951 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1953 = math.exp %1952 : tensor<32x40x40xf32>
-    %1954 = tosa.reduce_sum %1953 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %1955 = tosa.log %1954 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1956 = tosa.add %1951, %1955 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %1957 = tosa.sub %1950, %1956 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %1958 = math.exp %1957 : tensor<32x40x40xf32>
-    %1959 = tosa.reshape %1956 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %1960 = tosa.reshape %1923 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1961 = tosa.matmul %1958, %1960 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %1962 = tosa.reshape %1961 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %1963 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %1964 = tosa.transpose %1962, %1963 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %1965 = tosa.reshape %1964 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %1966 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1967 = tosa.transpose %arg151, %1966 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %1968 = tosa.reshape %1965 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_458 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %1969 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1968, %1967 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_458 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %1970 = tosa.reshape %1969 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1971 = tosa.add %1887, %1970 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1972 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_459 = arith.constant 2 : i32
-    %1973 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1971 : tensor<1x40x4096xf32>) outs(%1972 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_459 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %1974 = tosa.reduce_sum %1973 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %1975 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %1976 = tosa.reciprocal %1975 : (tensor<1xf32>) -> tensor<1xf32>
-    %1977 = tosa.mul %1976, %1974 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1978 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %1979 = tosa.add %1977, %1978 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1980 = tosa.rsqrt %1979 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %1981 = tosa.mul %1971, %1980 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %1982 = tosa.reshape %arg152 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %1983 = tosa.mul %1982, %1981 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %1984 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1985 = tosa.transpose %arg153, %1984 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1986 = tosa.reshape %1983 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_460 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1987 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1986, %1985 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_460 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1988 = tosa.reshape %1987 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1989 = tosa.sigmoid %1988 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1990 = tosa.mul %1988, %1989 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1991 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1992 = tosa.transpose %arg154, %1991 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %1993 = tosa.reshape %1983 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_461 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %1994 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1993, %1992 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_461 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %1995 = tosa.reshape %1994 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1996 = tosa.mul %1990, %1995 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %1997 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %1998 = tosa.transpose %arg155, %1997 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %1999 = tosa.reshape %1996 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_462 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2000 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%1999, %1998 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_462 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2001 = tosa.reshape %2000 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2002 = tosa.add %1971, %2001 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2003 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_463 = arith.constant 2 : i32
-    %2004 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2002 : tensor<1x40x4096xf32>) outs(%2003 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_463 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2005 = tosa.reduce_sum %2004 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2006 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2007 = tosa.reciprocal %2006 : (tensor<1xf32>) -> tensor<1xf32>
-    %2008 = tosa.mul %2007, %2005 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2009 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2010 = tosa.add %2008, %2009 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2011 = tosa.rsqrt %2010 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2012 = tosa.mul %2002, %2011 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2013 = tosa.reshape %arg156 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2014 = tosa.mul %2013, %2012 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2015 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2016 = tosa.transpose %arg157, %2015 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2017 = tosa.reshape %2014 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_464 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2018 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2017, %2016 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_464 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2019 = tosa.reshape %2018 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2020 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2021 = tosa.transpose %arg158, %2020 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2022 = tosa.reshape %2014 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_465 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2023 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2022, %2021 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_465 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2024 = tosa.reshape %2023 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2025 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2026 = tosa.transpose %arg159, %2025 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2027 = tosa.reshape %2014 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_466 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2028 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2027, %2026 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_466 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2029 = tosa.reshape %2028 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2030 = tosa.reshape %2019 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2031 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2032 = tosa.transpose %2030, %2031 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2033 = tosa.reshape %2024 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2034 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2035 = tosa.transpose %2033, %2034 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2036 = tosa.reshape %2029 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2037 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2038 = tosa.transpose %2036, %2037 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2039 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2040 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2041 = tosa.mul %2032, %2039 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_467 = tensor.extract_slice %2032[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_468 = tensor.extract_slice %2032[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2042 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2043 = linalg.negf ins(%extracted_slice_468 : tensor<1x32x40x64xf32>) outs(%2042 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2044 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_469 = tensor.insert_slice %2043 into %2044[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_470 = tensor.insert_slice %extracted_slice_467 into %inserted_slice_469[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2045 = tosa.mul %inserted_slice_470, %2040 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2046 = tosa.add %2041, %2045 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2047 = tosa.mul %2035, %2039 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_471 = tensor.extract_slice %2035[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_472 = tensor.extract_slice %2035[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2048 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2049 = linalg.negf ins(%extracted_slice_472 : tensor<1x32x40x64xf32>) outs(%2048 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2050 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_473 = tensor.insert_slice %2049 into %2050[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_474 = tensor.insert_slice %extracted_slice_471 into %inserted_slice_473[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2051 = tosa.mul %inserted_slice_474, %2040 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2052 = tosa.add %2047, %2051 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2053 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %2054 = tosa.reshape %2053 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_475 = tensor.extract_slice %2054[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_476 = tensor.extract_slice %extracted_slice_475[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %2055 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %2056 = tosa.add %extracted_slice_476, %2055 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_477 = tensor.extract_slice %2056[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_478 = tensor.extract_slice %extracted_slice_477[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_479 = tensor.extract_slice %extracted_slice_478[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_480 = tensor.extract_slice %extracted_slice_479[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_481 = arith.constant 0.000000e+00 : f32
-    %splat_482 = tensor.splat %cst_481 : tensor<40x40xf32>
-    %2057 = tosa.reshape %extracted_slice_480 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %2058 = tosa.add %splat_482, %2057 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %2059 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2060 = tosa.transpose %2052, %2059 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %2061 = tosa.reshape %2046 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2062 = tosa.reshape %2060 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %2063 = tosa.matmul %2061, %2062 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_483 = arith.constant 0.0883883461 : f32
-    %splat_484 = tensor.splat %cst_483 : tensor<32x40x40xf32>
-    %2064 = tosa.mul %2063, %splat_484 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %2065 = tosa.add %2064, %2058 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %2066 = tosa.reduce_max %2065 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2067 = tosa.sub %2065, %2066 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2068 = math.exp %2067 : tensor<32x40x40xf32>
-    %2069 = tosa.reduce_sum %2068 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2070 = tosa.log %2069 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2071 = tosa.add %2066, %2070 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2072 = tosa.sub %2065, %2071 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2073 = math.exp %2072 : tensor<32x40x40xf32>
-    %2074 = tosa.reshape %2071 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %2075 = tosa.reshape %2038 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2076 = tosa.matmul %2073, %2075 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2077 = tosa.reshape %2076 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2078 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2079 = tosa.transpose %2077, %2078 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %2080 = tosa.reshape %2079 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %2081 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2082 = tosa.transpose %arg160, %2081 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2083 = tosa.reshape %2080 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_485 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2084 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2083, %2082 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_485 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2085 = tosa.reshape %2084 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2086 = tosa.add %2002, %2085 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2087 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_486 = arith.constant 2 : i32
-    %2088 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2086 : tensor<1x40x4096xf32>) outs(%2087 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_486 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2089 = tosa.reduce_sum %2088 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2090 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2091 = tosa.reciprocal %2090 : (tensor<1xf32>) -> tensor<1xf32>
-    %2092 = tosa.mul %2091, %2089 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2093 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2094 = tosa.add %2092, %2093 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2095 = tosa.rsqrt %2094 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2096 = tosa.mul %2086, %2095 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2097 = tosa.reshape %arg161 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2098 = tosa.mul %2097, %2096 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2099 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2100 = tosa.transpose %arg162, %2099 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2101 = tosa.reshape %2098 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_487 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2102 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2101, %2100 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_487 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2103 = tosa.reshape %2102 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2104 = tosa.sigmoid %2103 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2105 = tosa.mul %2103, %2104 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2106 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2107 = tosa.transpose %arg163, %2106 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2108 = tosa.reshape %2098 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_488 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2109 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2108, %2107 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_488 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2110 = tosa.reshape %2109 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2111 = tosa.mul %2105, %2110 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2112 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2113 = tosa.transpose %arg164, %2112 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %2114 = tosa.reshape %2111 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_489 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2115 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2114, %2113 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_489 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2116 = tosa.reshape %2115 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2117 = tosa.add %2086, %2116 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2118 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_490 = arith.constant 2 : i32
-    %2119 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2117 : tensor<1x40x4096xf32>) outs(%2118 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_490 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2120 = tosa.reduce_sum %2119 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2121 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2122 = tosa.reciprocal %2121 : (tensor<1xf32>) -> tensor<1xf32>
-    %2123 = tosa.mul %2122, %2120 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2124 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2125 = tosa.add %2123, %2124 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2126 = tosa.rsqrt %2125 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2127 = tosa.mul %2117, %2126 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2128 = tosa.reshape %arg165 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2129 = tosa.mul %2128, %2127 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2130 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2131 = tosa.transpose %arg166, %2130 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2132 = tosa.reshape %2129 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_491 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2133 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2132, %2131 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_491 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2134 = tosa.reshape %2133 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2135 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2136 = tosa.transpose %arg167, %2135 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2137 = tosa.reshape %2129 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_492 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2138 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2137, %2136 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_492 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2139 = tosa.reshape %2138 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2140 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2141 = tosa.transpose %arg168, %2140 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2142 = tosa.reshape %2129 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_493 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2143 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2142, %2141 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_493 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2144 = tosa.reshape %2143 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2145 = tosa.reshape %2134 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2146 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2147 = tosa.transpose %2145, %2146 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2148 = tosa.reshape %2139 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2149 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2150 = tosa.transpose %2148, %2149 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2151 = tosa.reshape %2144 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2152 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2153 = tosa.transpose %2151, %2152 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2154 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2155 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2156 = tosa.mul %2147, %2154 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_494 = tensor.extract_slice %2147[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_495 = tensor.extract_slice %2147[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2157 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2158 = linalg.negf ins(%extracted_slice_495 : tensor<1x32x40x64xf32>) outs(%2157 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2159 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_496 = tensor.insert_slice %2158 into %2159[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_497 = tensor.insert_slice %extracted_slice_494 into %inserted_slice_496[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2160 = tosa.mul %inserted_slice_497, %2155 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2161 = tosa.add %2156, %2160 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2162 = tosa.mul %2150, %2154 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_498 = tensor.extract_slice %2150[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_499 = tensor.extract_slice %2150[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2163 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2164 = linalg.negf ins(%extracted_slice_499 : tensor<1x32x40x64xf32>) outs(%2163 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2165 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_500 = tensor.insert_slice %2164 into %2165[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_501 = tensor.insert_slice %extracted_slice_498 into %inserted_slice_500[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2166 = tosa.mul %inserted_slice_501, %2155 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2167 = tosa.add %2162, %2166 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2168 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %2169 = tosa.reshape %2168 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_502 = tensor.extract_slice %2169[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_503 = tensor.extract_slice %extracted_slice_502[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %2170 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %2171 = tosa.add %extracted_slice_503, %2170 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_504 = tensor.extract_slice %2171[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_505 = tensor.extract_slice %extracted_slice_504[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_506 = tensor.extract_slice %extracted_slice_505[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_507 = tensor.extract_slice %extracted_slice_506[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_508 = arith.constant 0.000000e+00 : f32
-    %splat_509 = tensor.splat %cst_508 : tensor<40x40xf32>
-    %2172 = tosa.reshape %extracted_slice_507 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %2173 = tosa.add %splat_509, %2172 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %2174 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2175 = tosa.transpose %2167, %2174 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %2176 = tosa.reshape %2161 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2177 = tosa.reshape %2175 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %2178 = tosa.matmul %2176, %2177 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_510 = arith.constant 0.0883883461 : f32
-    %splat_511 = tensor.splat %cst_510 : tensor<32x40x40xf32>
-    %2179 = tosa.mul %2178, %splat_511 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %2180 = tosa.add %2179, %2173 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %2181 = tosa.reduce_max %2180 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2182 = tosa.sub %2180, %2181 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2183 = math.exp %2182 : tensor<32x40x40xf32>
-    %2184 = tosa.reduce_sum %2183 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2185 = tosa.log %2184 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2186 = tosa.add %2181, %2185 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2187 = tosa.sub %2180, %2186 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2188 = math.exp %2187 : tensor<32x40x40xf32>
-    %2189 = tosa.reshape %2186 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %2190 = tosa.reshape %2153 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2191 = tosa.matmul %2188, %2190 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2192 = tosa.reshape %2191 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2193 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2194 = tosa.transpose %2192, %2193 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %2195 = tosa.reshape %2194 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %2196 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2197 = tosa.transpose %arg169, %2196 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2198 = tosa.reshape %2195 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_512 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2199 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2198, %2197 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_512 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2200 = tosa.reshape %2199 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2201 = tosa.add %2117, %2200 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2202 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_513 = arith.constant 2 : i32
-    %2203 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2201 : tensor<1x40x4096xf32>) outs(%2202 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_513 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2204 = tosa.reduce_sum %2203 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2205 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2206 = tosa.reciprocal %2205 : (tensor<1xf32>) -> tensor<1xf32>
-    %2207 = tosa.mul %2206, %2204 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2208 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2209 = tosa.add %2207, %2208 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2210 = tosa.rsqrt %2209 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2211 = tosa.mul %2201, %2210 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2212 = tosa.reshape %arg170 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2213 = tosa.mul %2212, %2211 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2214 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2215 = tosa.transpose %arg171, %2214 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2216 = tosa.reshape %2213 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_514 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2217 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2216, %2215 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_514 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2218 = tosa.reshape %2217 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2219 = tosa.sigmoid %2218 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2220 = tosa.mul %2218, %2219 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2221 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2222 = tosa.transpose %arg172, %2221 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2223 = tosa.reshape %2213 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_515 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2224 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2223, %2222 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_515 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2225 = tosa.reshape %2224 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2226 = tosa.mul %2220, %2225 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2227 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2228 = tosa.transpose %arg173, %2227 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %2229 = tosa.reshape %2226 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_516 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2230 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2229, %2228 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_516 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2231 = tosa.reshape %2230 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2232 = tosa.add %2201, %2231 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2233 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_517 = arith.constant 2 : i32
-    %2234 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2232 : tensor<1x40x4096xf32>) outs(%2233 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_517 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2235 = tosa.reduce_sum %2234 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2236 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2237 = tosa.reciprocal %2236 : (tensor<1xf32>) -> tensor<1xf32>
-    %2238 = tosa.mul %2237, %2235 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2239 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2240 = tosa.add %2238, %2239 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2241 = tosa.rsqrt %2240 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2242 = tosa.mul %2232, %2241 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2243 = tosa.reshape %arg174 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2244 = tosa.mul %2243, %2242 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2245 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2246 = tosa.transpose %arg175, %2245 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2247 = tosa.reshape %2244 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_518 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2248 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2247, %2246 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_518 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2249 = tosa.reshape %2248 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2250 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2251 = tosa.transpose %arg176, %2250 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2252 = tosa.reshape %2244 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_519 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2253 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2252, %2251 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_519 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2254 = tosa.reshape %2253 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2255 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2256 = tosa.transpose %arg177, %2255 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2257 = tosa.reshape %2244 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_520 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2258 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2257, %2256 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_520 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2259 = tosa.reshape %2258 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2260 = tosa.reshape %2249 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2261 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2262 = tosa.transpose %2260, %2261 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2263 = tosa.reshape %2254 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2264 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2265 = tosa.transpose %2263, %2264 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2266 = tosa.reshape %2259 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2267 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2268 = tosa.transpose %2266, %2267 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2269 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2270 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2271 = tosa.mul %2262, %2269 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_521 = tensor.extract_slice %2262[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_522 = tensor.extract_slice %2262[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2272 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2273 = linalg.negf ins(%extracted_slice_522 : tensor<1x32x40x64xf32>) outs(%2272 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2274 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_523 = tensor.insert_slice %2273 into %2274[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_524 = tensor.insert_slice %extracted_slice_521 into %inserted_slice_523[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2275 = tosa.mul %inserted_slice_524, %2270 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2276 = tosa.add %2271, %2275 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2277 = tosa.mul %2265, %2269 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_525 = tensor.extract_slice %2265[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_526 = tensor.extract_slice %2265[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2278 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2279 = linalg.negf ins(%extracted_slice_526 : tensor<1x32x40x64xf32>) outs(%2278 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2280 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_527 = tensor.insert_slice %2279 into %2280[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_528 = tensor.insert_slice %extracted_slice_525 into %inserted_slice_527[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2281 = tosa.mul %inserted_slice_528, %2270 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2282 = tosa.add %2277, %2281 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2283 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %2284 = tosa.reshape %2283 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_529 = tensor.extract_slice %2284[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_530 = tensor.extract_slice %extracted_slice_529[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %2285 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %2286 = tosa.add %extracted_slice_530, %2285 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_531 = tensor.extract_slice %2286[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_532 = tensor.extract_slice %extracted_slice_531[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_533 = tensor.extract_slice %extracted_slice_532[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_534 = tensor.extract_slice %extracted_slice_533[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_535 = arith.constant 0.000000e+00 : f32
-    %splat_536 = tensor.splat %cst_535 : tensor<40x40xf32>
-    %2287 = tosa.reshape %extracted_slice_534 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %2288 = tosa.add %splat_536, %2287 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %2289 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2290 = tosa.transpose %2282, %2289 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %2291 = tosa.reshape %2276 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2292 = tosa.reshape %2290 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %2293 = tosa.matmul %2291, %2292 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_537 = arith.constant 0.0883883461 : f32
-    %splat_538 = tensor.splat %cst_537 : tensor<32x40x40xf32>
-    %2294 = tosa.mul %2293, %splat_538 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %2295 = tosa.add %2294, %2288 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %2296 = tosa.reduce_max %2295 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2297 = tosa.sub %2295, %2296 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2298 = math.exp %2297 : tensor<32x40x40xf32>
-    %2299 = tosa.reduce_sum %2298 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2300 = tosa.log %2299 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2301 = tosa.add %2296, %2300 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2302 = tosa.sub %2295, %2301 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2303 = math.exp %2302 : tensor<32x40x40xf32>
-    %2304 = tosa.reshape %2301 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %2305 = tosa.reshape %2268 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2306 = tosa.matmul %2303, %2305 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2307 = tosa.reshape %2306 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2308 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2309 = tosa.transpose %2307, %2308 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %2310 = tosa.reshape %2309 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %2311 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2312 = tosa.transpose %arg178, %2311 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2313 = tosa.reshape %2310 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_539 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2314 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2313, %2312 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_539 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2315 = tosa.reshape %2314 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2316 = tosa.add %2232, %2315 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2317 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_540 = arith.constant 2 : i32
-    %2318 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2316 : tensor<1x40x4096xf32>) outs(%2317 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_540 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2319 = tosa.reduce_sum %2318 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2320 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2321 = tosa.reciprocal %2320 : (tensor<1xf32>) -> tensor<1xf32>
-    %2322 = tosa.mul %2321, %2319 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2323 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2324 = tosa.add %2322, %2323 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2325 = tosa.rsqrt %2324 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2326 = tosa.mul %2316, %2325 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2327 = tosa.reshape %arg179 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2328 = tosa.mul %2327, %2326 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2329 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2330 = tosa.transpose %arg180, %2329 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2331 = tosa.reshape %2328 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_541 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2332 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2331, %2330 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_541 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2333 = tosa.reshape %2332 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2334 = tosa.sigmoid %2333 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2335 = tosa.mul %2333, %2334 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2336 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2337 = tosa.transpose %arg181, %2336 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2338 = tosa.reshape %2328 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_542 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2339 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2338, %2337 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_542 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2340 = tosa.reshape %2339 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2341 = tosa.mul %2335, %2340 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2342 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2343 = tosa.transpose %arg182, %2342 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %2344 = tosa.reshape %2341 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_543 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2345 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2344, %2343 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_543 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2346 = tosa.reshape %2345 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2347 = tosa.add %2316, %2346 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2348 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_544 = arith.constant 2 : i32
-    %2349 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2347 : tensor<1x40x4096xf32>) outs(%2348 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_544 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2350 = tosa.reduce_sum %2349 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2351 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2352 = tosa.reciprocal %2351 : (tensor<1xf32>) -> tensor<1xf32>
-    %2353 = tosa.mul %2352, %2350 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2354 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2355 = tosa.add %2353, %2354 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2356 = tosa.rsqrt %2355 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2357 = tosa.mul %2347, %2356 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2358 = tosa.reshape %arg183 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2359 = tosa.mul %2358, %2357 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2360 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2361 = tosa.transpose %arg184, %2360 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2362 = tosa.reshape %2359 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_545 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2363 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2362, %2361 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_545 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2364 = tosa.reshape %2363 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2365 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2366 = tosa.transpose %arg185, %2365 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2367 = tosa.reshape %2359 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_546 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2368 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2367, %2366 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_546 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2369 = tosa.reshape %2368 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2370 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2371 = tosa.transpose %arg186, %2370 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2372 = tosa.reshape %2359 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_547 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2373 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2372, %2371 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_547 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2374 = tosa.reshape %2373 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2375 = tosa.reshape %2364 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2376 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2377 = tosa.transpose %2375, %2376 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2378 = tosa.reshape %2369 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2379 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2380 = tosa.transpose %2378, %2379 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2381 = tosa.reshape %2374 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2382 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2383 = tosa.transpose %2381, %2382 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2384 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2385 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2386 = tosa.mul %2377, %2384 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_548 = tensor.extract_slice %2377[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_549 = tensor.extract_slice %2377[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2387 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2388 = linalg.negf ins(%extracted_slice_549 : tensor<1x32x40x64xf32>) outs(%2387 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2389 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_550 = tensor.insert_slice %2388 into %2389[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_551 = tensor.insert_slice %extracted_slice_548 into %inserted_slice_550[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2390 = tosa.mul %inserted_slice_551, %2385 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2391 = tosa.add %2386, %2390 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2392 = tosa.mul %2380, %2384 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_552 = tensor.extract_slice %2380[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_553 = tensor.extract_slice %2380[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2393 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2394 = linalg.negf ins(%extracted_slice_553 : tensor<1x32x40x64xf32>) outs(%2393 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2395 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_554 = tensor.insert_slice %2394 into %2395[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_555 = tensor.insert_slice %extracted_slice_552 into %inserted_slice_554[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2396 = tosa.mul %inserted_slice_555, %2385 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2397 = tosa.add %2392, %2396 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2398 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %2399 = tosa.reshape %2398 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_556 = tensor.extract_slice %2399[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_557 = tensor.extract_slice %extracted_slice_556[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %2400 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %2401 = tosa.add %extracted_slice_557, %2400 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_558 = tensor.extract_slice %2401[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_559 = tensor.extract_slice %extracted_slice_558[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_560 = tensor.extract_slice %extracted_slice_559[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_561 = tensor.extract_slice %extracted_slice_560[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_562 = arith.constant 0.000000e+00 : f32
-    %splat_563 = tensor.splat %cst_562 : tensor<40x40xf32>
-    %2402 = tosa.reshape %extracted_slice_561 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %2403 = tosa.add %splat_563, %2402 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %2404 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2405 = tosa.transpose %2397, %2404 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %2406 = tosa.reshape %2391 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2407 = tosa.reshape %2405 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %2408 = tosa.matmul %2406, %2407 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_564 = arith.constant 0.0883883461 : f32
-    %splat_565 = tensor.splat %cst_564 : tensor<32x40x40xf32>
-    %2409 = tosa.mul %2408, %splat_565 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %2410 = tosa.add %2409, %2403 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %2411 = tosa.reduce_max %2410 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2412 = tosa.sub %2410, %2411 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2413 = math.exp %2412 : tensor<32x40x40xf32>
-    %2414 = tosa.reduce_sum %2413 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2415 = tosa.log %2414 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2416 = tosa.add %2411, %2415 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2417 = tosa.sub %2410, %2416 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2418 = math.exp %2417 : tensor<32x40x40xf32>
-    %2419 = tosa.reshape %2416 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %2420 = tosa.reshape %2383 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2421 = tosa.matmul %2418, %2420 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2422 = tosa.reshape %2421 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2423 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2424 = tosa.transpose %2422, %2423 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %2425 = tosa.reshape %2424 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %2426 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2427 = tosa.transpose %arg187, %2426 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2428 = tosa.reshape %2425 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_566 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2429 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2428, %2427 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_566 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2430 = tosa.reshape %2429 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2431 = tosa.add %2347, %2430 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2432 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_567 = arith.constant 2 : i32
-    %2433 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2431 : tensor<1x40x4096xf32>) outs(%2432 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_567 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2434 = tosa.reduce_sum %2433 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2435 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2436 = tosa.reciprocal %2435 : (tensor<1xf32>) -> tensor<1xf32>
-    %2437 = tosa.mul %2436, %2434 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2438 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2439 = tosa.add %2437, %2438 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2440 = tosa.rsqrt %2439 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2441 = tosa.mul %2431, %2440 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2442 = tosa.reshape %arg188 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2443 = tosa.mul %2442, %2441 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2444 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2445 = tosa.transpose %arg189, %2444 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2446 = tosa.reshape %2443 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_568 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2447 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2446, %2445 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_568 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2448 = tosa.reshape %2447 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2449 = tosa.sigmoid %2448 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2450 = tosa.mul %2448, %2449 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2451 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2452 = tosa.transpose %arg190, %2451 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2453 = tosa.reshape %2443 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_569 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2454 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2453, %2452 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_569 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2455 = tosa.reshape %2454 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2456 = tosa.mul %2450, %2455 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2457 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2458 = tosa.transpose %arg191, %2457 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %2459 = tosa.reshape %2456 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_570 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2460 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2459, %2458 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_570 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2461 = tosa.reshape %2460 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2462 = tosa.add %2431, %2461 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2463 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_571 = arith.constant 2 : i32
-    %2464 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2462 : tensor<1x40x4096xf32>) outs(%2463 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_571 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2465 = tosa.reduce_sum %2464 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2466 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2467 = tosa.reciprocal %2466 : (tensor<1xf32>) -> tensor<1xf32>
-    %2468 = tosa.mul %2467, %2465 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2469 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2470 = tosa.add %2468, %2469 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2471 = tosa.rsqrt %2470 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2472 = tosa.mul %2462, %2471 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2473 = tosa.reshape %arg192 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2474 = tosa.mul %2473, %2472 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2475 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2476 = tosa.transpose %arg193, %2475 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2477 = tosa.reshape %2474 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_572 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2478 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2477, %2476 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_572 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2479 = tosa.reshape %2478 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2480 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2481 = tosa.transpose %arg194, %2480 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2482 = tosa.reshape %2474 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_573 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2483 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2482, %2481 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_573 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2484 = tosa.reshape %2483 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2485 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2486 = tosa.transpose %arg195, %2485 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2487 = tosa.reshape %2474 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_574 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2488 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2487, %2486 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_574 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2489 = tosa.reshape %2488 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2490 = tosa.reshape %2479 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2491 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2492 = tosa.transpose %2490, %2491 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2493 = tosa.reshape %2484 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2494 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2495 = tosa.transpose %2493, %2494 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2496 = tosa.reshape %2489 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2497 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2498 = tosa.transpose %2496, %2497 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2499 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2500 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2501 = tosa.mul %2492, %2499 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_575 = tensor.extract_slice %2492[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_576 = tensor.extract_slice %2492[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2502 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2503 = linalg.negf ins(%extracted_slice_576 : tensor<1x32x40x64xf32>) outs(%2502 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2504 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_577 = tensor.insert_slice %2503 into %2504[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_578 = tensor.insert_slice %extracted_slice_575 into %inserted_slice_577[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2505 = tosa.mul %inserted_slice_578, %2500 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2506 = tosa.add %2501, %2505 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2507 = tosa.mul %2495, %2499 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_579 = tensor.extract_slice %2495[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_580 = tensor.extract_slice %2495[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2508 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2509 = linalg.negf ins(%extracted_slice_580 : tensor<1x32x40x64xf32>) outs(%2508 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2510 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_581 = tensor.insert_slice %2509 into %2510[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_582 = tensor.insert_slice %extracted_slice_579 into %inserted_slice_581[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2511 = tosa.mul %inserted_slice_582, %2500 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2512 = tosa.add %2507, %2511 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2513 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %2514 = tosa.reshape %2513 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_583 = tensor.extract_slice %2514[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_584 = tensor.extract_slice %extracted_slice_583[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %2515 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %2516 = tosa.add %extracted_slice_584, %2515 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_585 = tensor.extract_slice %2516[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_586 = tensor.extract_slice %extracted_slice_585[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_587 = tensor.extract_slice %extracted_slice_586[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_588 = tensor.extract_slice %extracted_slice_587[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_589 = arith.constant 0.000000e+00 : f32
-    %splat_590 = tensor.splat %cst_589 : tensor<40x40xf32>
-    %2517 = tosa.reshape %extracted_slice_588 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %2518 = tosa.add %splat_590, %2517 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %2519 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2520 = tosa.transpose %2512, %2519 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %2521 = tosa.reshape %2506 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2522 = tosa.reshape %2520 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %2523 = tosa.matmul %2521, %2522 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_591 = arith.constant 0.0883883461 : f32
-    %splat_592 = tensor.splat %cst_591 : tensor<32x40x40xf32>
-    %2524 = tosa.mul %2523, %splat_592 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %2525 = tosa.add %2524, %2518 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %2526 = tosa.reduce_max %2525 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2527 = tosa.sub %2525, %2526 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2528 = math.exp %2527 : tensor<32x40x40xf32>
-    %2529 = tosa.reduce_sum %2528 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2530 = tosa.log %2529 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2531 = tosa.add %2526, %2530 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2532 = tosa.sub %2525, %2531 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2533 = math.exp %2532 : tensor<32x40x40xf32>
-    %2534 = tosa.reshape %2531 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %2535 = tosa.reshape %2498 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2536 = tosa.matmul %2533, %2535 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2537 = tosa.reshape %2536 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2538 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2539 = tosa.transpose %2537, %2538 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %2540 = tosa.reshape %2539 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %2541 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2542 = tosa.transpose %arg196, %2541 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2543 = tosa.reshape %2540 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_593 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2544 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2543, %2542 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_593 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2545 = tosa.reshape %2544 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2546 = tosa.add %2462, %2545 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2547 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_594 = arith.constant 2 : i32
-    %2548 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2546 : tensor<1x40x4096xf32>) outs(%2547 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_594 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2549 = tosa.reduce_sum %2548 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2550 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2551 = tosa.reciprocal %2550 : (tensor<1xf32>) -> tensor<1xf32>
-    %2552 = tosa.mul %2551, %2549 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2553 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2554 = tosa.add %2552, %2553 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2555 = tosa.rsqrt %2554 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2556 = tosa.mul %2546, %2555 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2557 = tosa.reshape %arg197 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2558 = tosa.mul %2557, %2556 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2559 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2560 = tosa.transpose %arg198, %2559 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2561 = tosa.reshape %2558 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_595 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2562 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2561, %2560 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_595 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2563 = tosa.reshape %2562 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2564 = tosa.sigmoid %2563 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2565 = tosa.mul %2563, %2564 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2566 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2567 = tosa.transpose %arg199, %2566 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2568 = tosa.reshape %2558 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_596 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2569 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2568, %2567 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_596 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2570 = tosa.reshape %2569 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2571 = tosa.mul %2565, %2570 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2572 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2573 = tosa.transpose %arg200, %2572 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %2574 = tosa.reshape %2571 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_597 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2575 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2574, %2573 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_597 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2576 = tosa.reshape %2575 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2577 = tosa.add %2546, %2576 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2578 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_598 = arith.constant 2 : i32
-    %2579 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2577 : tensor<1x40x4096xf32>) outs(%2578 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_598 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2580 = tosa.reduce_sum %2579 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2581 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2582 = tosa.reciprocal %2581 : (tensor<1xf32>) -> tensor<1xf32>
-    %2583 = tosa.mul %2582, %2580 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2584 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2585 = tosa.add %2583, %2584 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2586 = tosa.rsqrt %2585 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2587 = tosa.mul %2577, %2586 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2588 = tosa.reshape %arg201 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2589 = tosa.mul %2588, %2587 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2590 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2591 = tosa.transpose %arg202, %2590 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2592 = tosa.reshape %2589 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_599 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2593 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2592, %2591 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_599 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2594 = tosa.reshape %2593 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2595 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2596 = tosa.transpose %arg203, %2595 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2597 = tosa.reshape %2589 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_600 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2598 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2597, %2596 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_600 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2599 = tosa.reshape %2598 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2600 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2601 = tosa.transpose %arg204, %2600 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2602 = tosa.reshape %2589 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_601 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2603 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2602, %2601 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_601 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2604 = tosa.reshape %2603 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2605 = tosa.reshape %2594 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2606 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2607 = tosa.transpose %2605, %2606 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2608 = tosa.reshape %2599 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2609 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2610 = tosa.transpose %2608, %2609 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2611 = tosa.reshape %2604 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2612 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2613 = tosa.transpose %2611, %2612 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2614 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2615 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2616 = tosa.mul %2607, %2614 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_602 = tensor.extract_slice %2607[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_603 = tensor.extract_slice %2607[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2617 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2618 = linalg.negf ins(%extracted_slice_603 : tensor<1x32x40x64xf32>) outs(%2617 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2619 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_604 = tensor.insert_slice %2618 into %2619[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_605 = tensor.insert_slice %extracted_slice_602 into %inserted_slice_604[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2620 = tosa.mul %inserted_slice_605, %2615 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2621 = tosa.add %2616, %2620 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2622 = tosa.mul %2610, %2614 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_606 = tensor.extract_slice %2610[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_607 = tensor.extract_slice %2610[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2623 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2624 = linalg.negf ins(%extracted_slice_607 : tensor<1x32x40x64xf32>) outs(%2623 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2625 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_608 = tensor.insert_slice %2624 into %2625[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_609 = tensor.insert_slice %extracted_slice_606 into %inserted_slice_608[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2626 = tosa.mul %inserted_slice_609, %2615 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2627 = tosa.add %2622, %2626 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2628 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %2629 = tosa.reshape %2628 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_610 = tensor.extract_slice %2629[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_611 = tensor.extract_slice %extracted_slice_610[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %2630 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %2631 = tosa.add %extracted_slice_611, %2630 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_612 = tensor.extract_slice %2631[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_613 = tensor.extract_slice %extracted_slice_612[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_614 = tensor.extract_slice %extracted_slice_613[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_615 = tensor.extract_slice %extracted_slice_614[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_616 = arith.constant 0.000000e+00 : f32
-    %splat_617 = tensor.splat %cst_616 : tensor<40x40xf32>
-    %2632 = tosa.reshape %extracted_slice_615 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %2633 = tosa.add %splat_617, %2632 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %2634 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2635 = tosa.transpose %2627, %2634 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %2636 = tosa.reshape %2621 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2637 = tosa.reshape %2635 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %2638 = tosa.matmul %2636, %2637 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_618 = arith.constant 0.0883883461 : f32
-    %splat_619 = tensor.splat %cst_618 : tensor<32x40x40xf32>
-    %2639 = tosa.mul %2638, %splat_619 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %2640 = tosa.add %2639, %2633 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %2641 = tosa.reduce_max %2640 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2642 = tosa.sub %2640, %2641 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2643 = math.exp %2642 : tensor<32x40x40xf32>
-    %2644 = tosa.reduce_sum %2643 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2645 = tosa.log %2644 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2646 = tosa.add %2641, %2645 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2647 = tosa.sub %2640, %2646 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2648 = math.exp %2647 : tensor<32x40x40xf32>
-    %2649 = tosa.reshape %2646 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %2650 = tosa.reshape %2613 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2651 = tosa.matmul %2648, %2650 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2652 = tosa.reshape %2651 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2653 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2654 = tosa.transpose %2652, %2653 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %2655 = tosa.reshape %2654 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %2656 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2657 = tosa.transpose %arg205, %2656 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2658 = tosa.reshape %2655 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_620 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2659 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2658, %2657 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_620 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2660 = tosa.reshape %2659 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2661 = tosa.add %2577, %2660 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2662 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_621 = arith.constant 2 : i32
-    %2663 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2661 : tensor<1x40x4096xf32>) outs(%2662 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_621 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2664 = tosa.reduce_sum %2663 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2665 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2666 = tosa.reciprocal %2665 : (tensor<1xf32>) -> tensor<1xf32>
-    %2667 = tosa.mul %2666, %2664 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2668 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2669 = tosa.add %2667, %2668 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2670 = tosa.rsqrt %2669 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2671 = tosa.mul %2661, %2670 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2672 = tosa.reshape %arg206 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2673 = tosa.mul %2672, %2671 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2674 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2675 = tosa.transpose %arg207, %2674 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2676 = tosa.reshape %2673 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_622 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2677 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2676, %2675 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_622 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2678 = tosa.reshape %2677 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2679 = tosa.sigmoid %2678 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2680 = tosa.mul %2678, %2679 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2681 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2682 = tosa.transpose %arg208, %2681 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2683 = tosa.reshape %2673 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_623 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2684 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2683, %2682 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_623 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2685 = tosa.reshape %2684 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2686 = tosa.mul %2680, %2685 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2687 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2688 = tosa.transpose %arg209, %2687 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %2689 = tosa.reshape %2686 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_624 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2690 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2689, %2688 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_624 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2691 = tosa.reshape %2690 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2692 = tosa.add %2661, %2691 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2693 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_625 = arith.constant 2 : i32
-    %2694 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2692 : tensor<1x40x4096xf32>) outs(%2693 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_625 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2695 = tosa.reduce_sum %2694 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2696 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2697 = tosa.reciprocal %2696 : (tensor<1xf32>) -> tensor<1xf32>
-    %2698 = tosa.mul %2697, %2695 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2699 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2700 = tosa.add %2698, %2699 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2701 = tosa.rsqrt %2700 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2702 = tosa.mul %2692, %2701 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2703 = tosa.reshape %arg210 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2704 = tosa.mul %2703, %2702 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2705 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2706 = tosa.transpose %arg211, %2705 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2707 = tosa.reshape %2704 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_626 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2708 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2707, %2706 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_626 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2709 = tosa.reshape %2708 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2710 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2711 = tosa.transpose %arg212, %2710 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2712 = tosa.reshape %2704 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_627 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2713 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2712, %2711 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_627 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2714 = tosa.reshape %2713 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2715 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2716 = tosa.transpose %arg213, %2715 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2717 = tosa.reshape %2704 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_628 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2718 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2717, %2716 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_628 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2719 = tosa.reshape %2718 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2720 = tosa.reshape %2709 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2721 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2722 = tosa.transpose %2720, %2721 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2723 = tosa.reshape %2714 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2724 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2725 = tosa.transpose %2723, %2724 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2726 = tosa.reshape %2719 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2727 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2728 = tosa.transpose %2726, %2727 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2729 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2730 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2731 = tosa.mul %2722, %2729 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_629 = tensor.extract_slice %2722[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_630 = tensor.extract_slice %2722[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2732 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2733 = linalg.negf ins(%extracted_slice_630 : tensor<1x32x40x64xf32>) outs(%2732 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2734 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_631 = tensor.insert_slice %2733 into %2734[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_632 = tensor.insert_slice %extracted_slice_629 into %inserted_slice_631[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2735 = tosa.mul %inserted_slice_632, %2730 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2736 = tosa.add %2731, %2735 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2737 = tosa.mul %2725, %2729 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_633 = tensor.extract_slice %2725[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_634 = tensor.extract_slice %2725[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2738 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2739 = linalg.negf ins(%extracted_slice_634 : tensor<1x32x40x64xf32>) outs(%2738 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2740 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_635 = tensor.insert_slice %2739 into %2740[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_636 = tensor.insert_slice %extracted_slice_633 into %inserted_slice_635[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2741 = tosa.mul %inserted_slice_636, %2730 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2742 = tosa.add %2737, %2741 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2743 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %2744 = tosa.reshape %2743 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_637 = tensor.extract_slice %2744[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_638 = tensor.extract_slice %extracted_slice_637[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %2745 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %2746 = tosa.add %extracted_slice_638, %2745 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_639 = tensor.extract_slice %2746[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_640 = tensor.extract_slice %extracted_slice_639[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_641 = tensor.extract_slice %extracted_slice_640[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_642 = tensor.extract_slice %extracted_slice_641[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_643 = arith.constant 0.000000e+00 : f32
-    %splat_644 = tensor.splat %cst_643 : tensor<40x40xf32>
-    %2747 = tosa.reshape %extracted_slice_642 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %2748 = tosa.add %splat_644, %2747 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %2749 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2750 = tosa.transpose %2742, %2749 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %2751 = tosa.reshape %2736 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2752 = tosa.reshape %2750 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %2753 = tosa.matmul %2751, %2752 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_645 = arith.constant 0.0883883461 : f32
-    %splat_646 = tensor.splat %cst_645 : tensor<32x40x40xf32>
-    %2754 = tosa.mul %2753, %splat_646 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %2755 = tosa.add %2754, %2748 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %2756 = tosa.reduce_max %2755 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2757 = tosa.sub %2755, %2756 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2758 = math.exp %2757 : tensor<32x40x40xf32>
-    %2759 = tosa.reduce_sum %2758 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2760 = tosa.log %2759 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2761 = tosa.add %2756, %2760 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2762 = tosa.sub %2755, %2761 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2763 = math.exp %2762 : tensor<32x40x40xf32>
-    %2764 = tosa.reshape %2761 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %2765 = tosa.reshape %2728 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2766 = tosa.matmul %2763, %2765 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2767 = tosa.reshape %2766 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2768 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2769 = tosa.transpose %2767, %2768 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %2770 = tosa.reshape %2769 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %2771 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2772 = tosa.transpose %arg214, %2771 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2773 = tosa.reshape %2770 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_647 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2774 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2773, %2772 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_647 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2775 = tosa.reshape %2774 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2776 = tosa.add %2692, %2775 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2777 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_648 = arith.constant 2 : i32
-    %2778 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2776 : tensor<1x40x4096xf32>) outs(%2777 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_648 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2779 = tosa.reduce_sum %2778 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2780 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2781 = tosa.reciprocal %2780 : (tensor<1xf32>) -> tensor<1xf32>
-    %2782 = tosa.mul %2781, %2779 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2783 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2784 = tosa.add %2782, %2783 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2785 = tosa.rsqrt %2784 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2786 = tosa.mul %2776, %2785 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2787 = tosa.reshape %arg215 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2788 = tosa.mul %2787, %2786 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2789 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2790 = tosa.transpose %arg216, %2789 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2791 = tosa.reshape %2788 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_649 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2792 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2791, %2790 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_649 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2793 = tosa.reshape %2792 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2794 = tosa.sigmoid %2793 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2795 = tosa.mul %2793, %2794 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2796 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2797 = tosa.transpose %arg217, %2796 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2798 = tosa.reshape %2788 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_650 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2799 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2798, %2797 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_650 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2800 = tosa.reshape %2799 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2801 = tosa.mul %2795, %2800 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2802 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2803 = tosa.transpose %arg218, %2802 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %2804 = tosa.reshape %2801 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_651 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2805 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2804, %2803 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_651 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2806 = tosa.reshape %2805 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2807 = tosa.add %2776, %2806 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2808 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_652 = arith.constant 2 : i32
-    %2809 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2807 : tensor<1x40x4096xf32>) outs(%2808 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_652 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2810 = tosa.reduce_sum %2809 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2811 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2812 = tosa.reciprocal %2811 : (tensor<1xf32>) -> tensor<1xf32>
-    %2813 = tosa.mul %2812, %2810 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2814 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2815 = tosa.add %2813, %2814 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2816 = tosa.rsqrt %2815 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2817 = tosa.mul %2807, %2816 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2818 = tosa.reshape %arg219 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2819 = tosa.mul %2818, %2817 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2820 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2821 = tosa.transpose %arg220, %2820 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2822 = tosa.reshape %2819 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_653 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2823 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2822, %2821 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_653 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2824 = tosa.reshape %2823 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2825 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2826 = tosa.transpose %arg221, %2825 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2827 = tosa.reshape %2819 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_654 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2828 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2827, %2826 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_654 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2829 = tosa.reshape %2828 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2830 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2831 = tosa.transpose %arg222, %2830 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2832 = tosa.reshape %2819 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_655 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2833 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2832, %2831 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_655 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2834 = tosa.reshape %2833 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2835 = tosa.reshape %2824 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2836 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2837 = tosa.transpose %2835, %2836 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2838 = tosa.reshape %2829 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2839 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2840 = tosa.transpose %2838, %2839 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2841 = tosa.reshape %2834 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2842 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2843 = tosa.transpose %2841, %2842 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2844 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2845 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2846 = tosa.mul %2837, %2844 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_656 = tensor.extract_slice %2837[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_657 = tensor.extract_slice %2837[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2847 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2848 = linalg.negf ins(%extracted_slice_657 : tensor<1x32x40x64xf32>) outs(%2847 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2849 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_658 = tensor.insert_slice %2848 into %2849[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_659 = tensor.insert_slice %extracted_slice_656 into %inserted_slice_658[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2850 = tosa.mul %inserted_slice_659, %2845 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2851 = tosa.add %2846, %2850 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2852 = tosa.mul %2840, %2844 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_660 = tensor.extract_slice %2840[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_661 = tensor.extract_slice %2840[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2853 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2854 = linalg.negf ins(%extracted_slice_661 : tensor<1x32x40x64xf32>) outs(%2853 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2855 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_662 = tensor.insert_slice %2854 into %2855[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_663 = tensor.insert_slice %extracted_slice_660 into %inserted_slice_662[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2856 = tosa.mul %inserted_slice_663, %2845 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2857 = tosa.add %2852, %2856 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2858 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %2859 = tosa.reshape %2858 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_664 = tensor.extract_slice %2859[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_665 = tensor.extract_slice %extracted_slice_664[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %2860 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %2861 = tosa.add %extracted_slice_665, %2860 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_666 = tensor.extract_slice %2861[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_667 = tensor.extract_slice %extracted_slice_666[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_668 = tensor.extract_slice %extracted_slice_667[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_669 = tensor.extract_slice %extracted_slice_668[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_670 = arith.constant 0.000000e+00 : f32
-    %splat_671 = tensor.splat %cst_670 : tensor<40x40xf32>
-    %2862 = tosa.reshape %extracted_slice_669 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %2863 = tosa.add %splat_671, %2862 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %2864 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2865 = tosa.transpose %2857, %2864 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %2866 = tosa.reshape %2851 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2867 = tosa.reshape %2865 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %2868 = tosa.matmul %2866, %2867 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_672 = arith.constant 0.0883883461 : f32
-    %splat_673 = tensor.splat %cst_672 : tensor<32x40x40xf32>
-    %2869 = tosa.mul %2868, %splat_673 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %2870 = tosa.add %2869, %2863 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %2871 = tosa.reduce_max %2870 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2872 = tosa.sub %2870, %2871 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2873 = math.exp %2872 : tensor<32x40x40xf32>
-    %2874 = tosa.reduce_sum %2873 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2875 = tosa.log %2874 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2876 = tosa.add %2871, %2875 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2877 = tosa.sub %2870, %2876 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2878 = math.exp %2877 : tensor<32x40x40xf32>
-    %2879 = tosa.reshape %2876 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %2880 = tosa.reshape %2843 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2881 = tosa.matmul %2878, %2880 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2882 = tosa.reshape %2881 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2883 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2884 = tosa.transpose %2882, %2883 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %2885 = tosa.reshape %2884 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %2886 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2887 = tosa.transpose %arg223, %2886 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2888 = tosa.reshape %2885 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_674 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2889 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2888, %2887 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_674 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2890 = tosa.reshape %2889 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2891 = tosa.add %2807, %2890 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2892 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_675 = arith.constant 2 : i32
-    %2893 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2891 : tensor<1x40x4096xf32>) outs(%2892 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_675 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2894 = tosa.reduce_sum %2893 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2895 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2896 = tosa.reciprocal %2895 : (tensor<1xf32>) -> tensor<1xf32>
-    %2897 = tosa.mul %2896, %2894 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2898 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2899 = tosa.add %2897, %2898 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2900 = tosa.rsqrt %2899 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2901 = tosa.mul %2891, %2900 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2902 = tosa.reshape %arg224 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2903 = tosa.mul %2902, %2901 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2904 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2905 = tosa.transpose %arg225, %2904 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2906 = tosa.reshape %2903 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_676 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2907 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2906, %2905 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_676 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2908 = tosa.reshape %2907 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2909 = tosa.sigmoid %2908 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2910 = tosa.mul %2908, %2909 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2911 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2912 = tosa.transpose %arg226, %2911 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %2913 = tosa.reshape %2903 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_677 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %2914 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2913, %2912 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_677 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %2915 = tosa.reshape %2914 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2916 = tosa.mul %2910, %2915 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %2917 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2918 = tosa.transpose %arg227, %2917 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %2919 = tosa.reshape %2916 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_678 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2920 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2919, %2918 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_678 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2921 = tosa.reshape %2920 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2922 = tosa.add %2891, %2921 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2923 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_679 = arith.constant 2 : i32
-    %2924 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2922 : tensor<1x40x4096xf32>) outs(%2923 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_679 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %2925 = tosa.reduce_sum %2924 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %2926 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %2927 = tosa.reciprocal %2926 : (tensor<1xf32>) -> tensor<1xf32>
-    %2928 = tosa.mul %2927, %2925 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2929 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %2930 = tosa.add %2928, %2929 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2931 = tosa.rsqrt %2930 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %2932 = tosa.mul %2922, %2931 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %2933 = tosa.reshape %arg228 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %2934 = tosa.mul %2933, %2932 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2935 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2936 = tosa.transpose %arg229, %2935 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2937 = tosa.reshape %2934 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_680 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2938 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2937, %2936 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_680 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2939 = tosa.reshape %2938 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2940 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2941 = tosa.transpose %arg230, %2940 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2942 = tosa.reshape %2934 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_681 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2943 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2942, %2941 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_681 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2944 = tosa.reshape %2943 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2945 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %2946 = tosa.transpose %arg231, %2945 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %2947 = tosa.reshape %2934 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_682 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %2948 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%2947, %2946 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_682 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %2949 = tosa.reshape %2948 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %2950 = tosa.reshape %2939 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2951 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2952 = tosa.transpose %2950, %2951 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2953 = tosa.reshape %2944 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2954 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2955 = tosa.transpose %2953, %2954 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2956 = tosa.reshape %2949 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %2957 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2958 = tosa.transpose %2956, %2957 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %2959 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2960 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %2961 = tosa.mul %2952, %2959 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_683 = tensor.extract_slice %2952[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_684 = tensor.extract_slice %2952[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2962 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2963 = linalg.negf ins(%extracted_slice_684 : tensor<1x32x40x64xf32>) outs(%2962 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2964 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_685 = tensor.insert_slice %2963 into %2964[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_686 = tensor.insert_slice %extracted_slice_683 into %inserted_slice_685[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2965 = tosa.mul %inserted_slice_686, %2960 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2966 = tosa.add %2961, %2965 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2967 = tosa.mul %2955, %2959 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_687 = tensor.extract_slice %2955[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_688 = tensor.extract_slice %2955[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %2968 = tensor.empty() : tensor<1x32x40x64xf32>
-    %2969 = linalg.negf ins(%extracted_slice_688 : tensor<1x32x40x64xf32>) outs(%2968 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %2970 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_689 = tensor.insert_slice %2969 into %2970[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_690 = tensor.insert_slice %extracted_slice_687 into %inserted_slice_689[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %2971 = tosa.mul %inserted_slice_690, %2960 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2972 = tosa.add %2967, %2971 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2973 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %2974 = tosa.reshape %2973 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_691 = tensor.extract_slice %2974[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_692 = tensor.extract_slice %extracted_slice_691[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %2975 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %2976 = tosa.add %extracted_slice_692, %2975 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_693 = tensor.extract_slice %2976[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_694 = tensor.extract_slice %extracted_slice_693[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_695 = tensor.extract_slice %extracted_slice_694[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_696 = tensor.extract_slice %extracted_slice_695[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_697 = arith.constant 0.000000e+00 : f32
-    %splat_698 = tensor.splat %cst_697 : tensor<40x40xf32>
-    %2977 = tosa.reshape %extracted_slice_696 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %2978 = tosa.add %splat_698, %2977 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %2979 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2980 = tosa.transpose %2972, %2979 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %2981 = tosa.reshape %2966 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2982 = tosa.reshape %2980 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %2983 = tosa.matmul %2981, %2982 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_699 = arith.constant 0.0883883461 : f32
-    %splat_700 = tensor.splat %cst_699 : tensor<32x40x40xf32>
-    %2984 = tosa.mul %2983, %splat_700 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %2985 = tosa.add %2984, %2978 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %2986 = tosa.reduce_max %2985 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2987 = tosa.sub %2985, %2986 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2988 = math.exp %2987 : tensor<32x40x40xf32>
-    %2989 = tosa.reduce_sum %2988 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %2990 = tosa.log %2989 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2991 = tosa.add %2986, %2990 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %2992 = tosa.sub %2985, %2991 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %2993 = math.exp %2992 : tensor<32x40x40xf32>
-    %2994 = tosa.reshape %2991 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %2995 = tosa.reshape %2958 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2996 = tosa.matmul %2993, %2995 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %2997 = tosa.reshape %2996 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %2998 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %2999 = tosa.transpose %2997, %2998 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %3000 = tosa.reshape %2999 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %3001 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3002 = tosa.transpose %arg232, %3001 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3003 = tosa.reshape %3000 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_701 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3004 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3003, %3002 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_701 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3005 = tosa.reshape %3004 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3006 = tosa.add %2922, %3005 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3007 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_702 = arith.constant 2 : i32
-    %3008 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3006 : tensor<1x40x4096xf32>) outs(%3007 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_702 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3009 = tosa.reduce_sum %3008 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3010 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3011 = tosa.reciprocal %3010 : (tensor<1xf32>) -> tensor<1xf32>
-    %3012 = tosa.mul %3011, %3009 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3013 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3014 = tosa.add %3012, %3013 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3015 = tosa.rsqrt %3014 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3016 = tosa.mul %3006, %3015 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3017 = tosa.reshape %arg233 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3018 = tosa.mul %3017, %3016 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3019 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3020 = tosa.transpose %arg234, %3019 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3021 = tosa.reshape %3018 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_703 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3022 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3021, %3020 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_703 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3023 = tosa.reshape %3022 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3024 = tosa.sigmoid %3023 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3025 = tosa.mul %3023, %3024 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3026 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3027 = tosa.transpose %arg235, %3026 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3028 = tosa.reshape %3018 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_704 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3029 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3028, %3027 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_704 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3030 = tosa.reshape %3029 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3031 = tosa.mul %3025, %3030 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3032 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3033 = tosa.transpose %arg236, %3032 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %3034 = tosa.reshape %3031 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_705 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3035 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3034, %3033 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_705 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3036 = tosa.reshape %3035 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3037 = tosa.add %3006, %3036 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3038 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_706 = arith.constant 2 : i32
-    %3039 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3037 : tensor<1x40x4096xf32>) outs(%3038 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_706 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3040 = tosa.reduce_sum %3039 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3041 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3042 = tosa.reciprocal %3041 : (tensor<1xf32>) -> tensor<1xf32>
-    %3043 = tosa.mul %3042, %3040 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3044 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3045 = tosa.add %3043, %3044 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3046 = tosa.rsqrt %3045 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3047 = tosa.mul %3037, %3046 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3048 = tosa.reshape %arg237 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3049 = tosa.mul %3048, %3047 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3050 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3051 = tosa.transpose %arg238, %3050 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3052 = tosa.reshape %3049 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_707 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3053 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3052, %3051 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_707 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3054 = tosa.reshape %3053 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3055 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3056 = tosa.transpose %arg239, %3055 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3057 = tosa.reshape %3049 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_708 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3058 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3057, %3056 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_708 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3059 = tosa.reshape %3058 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3060 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3061 = tosa.transpose %arg240, %3060 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3062 = tosa.reshape %3049 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_709 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3063 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3062, %3061 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_709 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3064 = tosa.reshape %3063 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3065 = tosa.reshape %3054 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3066 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3067 = tosa.transpose %3065, %3066 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3068 = tosa.reshape %3059 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3069 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3070 = tosa.transpose %3068, %3069 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3071 = tosa.reshape %3064 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3072 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3073 = tosa.transpose %3071, %3072 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3074 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %3075 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %3076 = tosa.mul %3067, %3074 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_710 = tensor.extract_slice %3067[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_711 = tensor.extract_slice %3067[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %3077 = tensor.empty() : tensor<1x32x40x64xf32>
-    %3078 = linalg.negf ins(%extracted_slice_711 : tensor<1x32x40x64xf32>) outs(%3077 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %3079 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_712 = tensor.insert_slice %3078 into %3079[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_713 = tensor.insert_slice %extracted_slice_710 into %inserted_slice_712[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %3080 = tosa.mul %inserted_slice_713, %3075 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3081 = tosa.add %3076, %3080 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3082 = tosa.mul %3070, %3074 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_714 = tensor.extract_slice %3070[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_715 = tensor.extract_slice %3070[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %3083 = tensor.empty() : tensor<1x32x40x64xf32>
-    %3084 = linalg.negf ins(%extracted_slice_715 : tensor<1x32x40x64xf32>) outs(%3083 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %3085 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_716 = tensor.insert_slice %3084 into %3085[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_717 = tensor.insert_slice %extracted_slice_714 into %inserted_slice_716[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %3086 = tosa.mul %inserted_slice_717, %3075 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3087 = tosa.add %3082, %3086 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3088 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %3089 = tosa.reshape %3088 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_718 = tensor.extract_slice %3089[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_719 = tensor.extract_slice %extracted_slice_718[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %3090 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %3091 = tosa.add %extracted_slice_719, %3090 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_720 = tensor.extract_slice %3091[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_721 = tensor.extract_slice %extracted_slice_720[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_722 = tensor.extract_slice %extracted_slice_721[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_723 = tensor.extract_slice %extracted_slice_722[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_724 = arith.constant 0.000000e+00 : f32
-    %splat_725 = tensor.splat %cst_724 : tensor<40x40xf32>
-    %3092 = tosa.reshape %extracted_slice_723 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %3093 = tosa.add %splat_725, %3092 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %3094 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3095 = tosa.transpose %3087, %3094 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %3096 = tosa.reshape %3081 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3097 = tosa.reshape %3095 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %3098 = tosa.matmul %3096, %3097 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_726 = arith.constant 0.0883883461 : f32
-    %splat_727 = tensor.splat %cst_726 : tensor<32x40x40xf32>
-    %3099 = tosa.mul %3098, %splat_727 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %3100 = tosa.add %3099, %3093 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %3101 = tosa.reduce_max %3100 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %3102 = tosa.sub %3100, %3101 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %3103 = math.exp %3102 : tensor<32x40x40xf32>
-    %3104 = tosa.reduce_sum %3103 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %3105 = tosa.log %3104 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %3106 = tosa.add %3101, %3105 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %3107 = tosa.sub %3100, %3106 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %3108 = math.exp %3107 : tensor<32x40x40xf32>
-    %3109 = tosa.reshape %3106 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %3110 = tosa.reshape %3073 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3111 = tosa.matmul %3108, %3110 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3112 = tosa.reshape %3111 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3113 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3114 = tosa.transpose %3112, %3113 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %3115 = tosa.reshape %3114 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %3116 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3117 = tosa.transpose %arg241, %3116 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3118 = tosa.reshape %3115 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_728 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3119 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3118, %3117 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_728 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3120 = tosa.reshape %3119 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3121 = tosa.add %3037, %3120 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3122 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_729 = arith.constant 2 : i32
-    %3123 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3121 : tensor<1x40x4096xf32>) outs(%3122 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_729 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3124 = tosa.reduce_sum %3123 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3125 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3126 = tosa.reciprocal %3125 : (tensor<1xf32>) -> tensor<1xf32>
-    %3127 = tosa.mul %3126, %3124 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3128 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3129 = tosa.add %3127, %3128 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3130 = tosa.rsqrt %3129 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3131 = tosa.mul %3121, %3130 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3132 = tosa.reshape %arg242 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3133 = tosa.mul %3132, %3131 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3134 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3135 = tosa.transpose %arg243, %3134 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3136 = tosa.reshape %3133 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_730 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3137 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3136, %3135 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_730 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3138 = tosa.reshape %3137 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3139 = tosa.sigmoid %3138 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3140 = tosa.mul %3138, %3139 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3141 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3142 = tosa.transpose %arg244, %3141 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3143 = tosa.reshape %3133 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_731 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3144 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3143, %3142 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_731 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3145 = tosa.reshape %3144 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3146 = tosa.mul %3140, %3145 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3147 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3148 = tosa.transpose %arg245, %3147 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %3149 = tosa.reshape %3146 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_732 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3150 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3149, %3148 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_732 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3151 = tosa.reshape %3150 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3152 = tosa.add %3121, %3151 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3153 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_733 = arith.constant 2 : i32
-    %3154 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3152 : tensor<1x40x4096xf32>) outs(%3153 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_733 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3155 = tosa.reduce_sum %3154 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3156 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3157 = tosa.reciprocal %3156 : (tensor<1xf32>) -> tensor<1xf32>
-    %3158 = tosa.mul %3157, %3155 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3159 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3160 = tosa.add %3158, %3159 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3161 = tosa.rsqrt %3160 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3162 = tosa.mul %3152, %3161 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3163 = tosa.reshape %arg246 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3164 = tosa.mul %3163, %3162 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3165 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3166 = tosa.transpose %arg247, %3165 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3167 = tosa.reshape %3164 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_734 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3168 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3167, %3166 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_734 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3169 = tosa.reshape %3168 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3170 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3171 = tosa.transpose %arg248, %3170 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3172 = tosa.reshape %3164 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_735 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3173 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3172, %3171 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_735 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3174 = tosa.reshape %3173 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3175 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3176 = tosa.transpose %arg249, %3175 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3177 = tosa.reshape %3164 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_736 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3178 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3177, %3176 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_736 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3179 = tosa.reshape %3178 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3180 = tosa.reshape %3169 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3181 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3182 = tosa.transpose %3180, %3181 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3183 = tosa.reshape %3174 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3184 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3185 = tosa.transpose %3183, %3184 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3186 = tosa.reshape %3179 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3187 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3188 = tosa.transpose %3186, %3187 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3189 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %3190 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %3191 = tosa.mul %3182, %3189 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_737 = tensor.extract_slice %3182[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_738 = tensor.extract_slice %3182[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %3192 = tensor.empty() : tensor<1x32x40x64xf32>
-    %3193 = linalg.negf ins(%extracted_slice_738 : tensor<1x32x40x64xf32>) outs(%3192 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %3194 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_739 = tensor.insert_slice %3193 into %3194[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_740 = tensor.insert_slice %extracted_slice_737 into %inserted_slice_739[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %3195 = tosa.mul %inserted_slice_740, %3190 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3196 = tosa.add %3191, %3195 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3197 = tosa.mul %3185, %3189 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_741 = tensor.extract_slice %3185[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_742 = tensor.extract_slice %3185[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %3198 = tensor.empty() : tensor<1x32x40x64xf32>
-    %3199 = linalg.negf ins(%extracted_slice_742 : tensor<1x32x40x64xf32>) outs(%3198 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %3200 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_743 = tensor.insert_slice %3199 into %3200[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_744 = tensor.insert_slice %extracted_slice_741 into %inserted_slice_743[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %3201 = tosa.mul %inserted_slice_744, %3190 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3202 = tosa.add %3197, %3201 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3203 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %3204 = tosa.reshape %3203 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_745 = tensor.extract_slice %3204[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_746 = tensor.extract_slice %extracted_slice_745[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %3205 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %3206 = tosa.add %extracted_slice_746, %3205 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_747 = tensor.extract_slice %3206[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_748 = tensor.extract_slice %extracted_slice_747[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_749 = tensor.extract_slice %extracted_slice_748[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_750 = tensor.extract_slice %extracted_slice_749[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_751 = arith.constant 0.000000e+00 : f32
-    %splat_752 = tensor.splat %cst_751 : tensor<40x40xf32>
-    %3207 = tosa.reshape %extracted_slice_750 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %3208 = tosa.add %splat_752, %3207 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %3209 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3210 = tosa.transpose %3202, %3209 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %3211 = tosa.reshape %3196 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3212 = tosa.reshape %3210 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %3213 = tosa.matmul %3211, %3212 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_753 = arith.constant 0.0883883461 : f32
-    %splat_754 = tensor.splat %cst_753 : tensor<32x40x40xf32>
-    %3214 = tosa.mul %3213, %splat_754 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %3215 = tosa.add %3214, %3208 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %3216 = tosa.reduce_max %3215 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %3217 = tosa.sub %3215, %3216 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %3218 = math.exp %3217 : tensor<32x40x40xf32>
-    %3219 = tosa.reduce_sum %3218 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %3220 = tosa.log %3219 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %3221 = tosa.add %3216, %3220 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %3222 = tosa.sub %3215, %3221 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %3223 = math.exp %3222 : tensor<32x40x40xf32>
-    %3224 = tosa.reshape %3221 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %3225 = tosa.reshape %3188 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3226 = tosa.matmul %3223, %3225 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3227 = tosa.reshape %3226 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3228 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3229 = tosa.transpose %3227, %3228 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %3230 = tosa.reshape %3229 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %3231 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3232 = tosa.transpose %arg250, %3231 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3233 = tosa.reshape %3230 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_755 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3234 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3233, %3232 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_755 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3235 = tosa.reshape %3234 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3236 = tosa.add %3152, %3235 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3237 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_756 = arith.constant 2 : i32
-    %3238 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3236 : tensor<1x40x4096xf32>) outs(%3237 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_756 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3239 = tosa.reduce_sum %3238 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3240 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3241 = tosa.reciprocal %3240 : (tensor<1xf32>) -> tensor<1xf32>
-    %3242 = tosa.mul %3241, %3239 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3243 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3244 = tosa.add %3242, %3243 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3245 = tosa.rsqrt %3244 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3246 = tosa.mul %3236, %3245 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3247 = tosa.reshape %arg251 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3248 = tosa.mul %3247, %3246 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3249 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3250 = tosa.transpose %arg252, %3249 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3251 = tosa.reshape %3248 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_757 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3252 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3251, %3250 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_757 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3253 = tosa.reshape %3252 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3254 = tosa.sigmoid %3253 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3255 = tosa.mul %3253, %3254 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3256 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3257 = tosa.transpose %arg253, %3256 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3258 = tosa.reshape %3248 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_758 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3259 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3258, %3257 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_758 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3260 = tosa.reshape %3259 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3261 = tosa.mul %3255, %3260 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3262 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3263 = tosa.transpose %arg254, %3262 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %3264 = tosa.reshape %3261 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_759 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3265 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3264, %3263 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_759 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3266 = tosa.reshape %3265 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3267 = tosa.add %3236, %3266 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3268 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_760 = arith.constant 2 : i32
-    %3269 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3267 : tensor<1x40x4096xf32>) outs(%3268 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_760 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3270 = tosa.reduce_sum %3269 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3271 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3272 = tosa.reciprocal %3271 : (tensor<1xf32>) -> tensor<1xf32>
-    %3273 = tosa.mul %3272, %3270 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3274 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3275 = tosa.add %3273, %3274 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3276 = tosa.rsqrt %3275 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3277 = tosa.mul %3267, %3276 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3278 = tosa.reshape %arg255 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3279 = tosa.mul %3278, %3277 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3280 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3281 = tosa.transpose %arg256, %3280 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3282 = tosa.reshape %3279 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_761 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3283 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3282, %3281 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_761 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3284 = tosa.reshape %3283 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3285 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3286 = tosa.transpose %arg257, %3285 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3287 = tosa.reshape %3279 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_762 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3288 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3287, %3286 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_762 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3289 = tosa.reshape %3288 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3290 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3291 = tosa.transpose %arg258, %3290 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3292 = tosa.reshape %3279 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_763 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3293 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3292, %3291 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_763 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3294 = tosa.reshape %3293 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3295 = tosa.reshape %3284 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3296 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3297 = tosa.transpose %3295, %3296 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3298 = tosa.reshape %3289 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3299 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3300 = tosa.transpose %3298, %3299 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3301 = tosa.reshape %3294 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3302 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3303 = tosa.transpose %3301, %3302 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3304 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %3305 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %3306 = tosa.mul %3297, %3304 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_764 = tensor.extract_slice %3297[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_765 = tensor.extract_slice %3297[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %3307 = tensor.empty() : tensor<1x32x40x64xf32>
-    %3308 = linalg.negf ins(%extracted_slice_765 : tensor<1x32x40x64xf32>) outs(%3307 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %3309 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_766 = tensor.insert_slice %3308 into %3309[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_767 = tensor.insert_slice %extracted_slice_764 into %inserted_slice_766[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %3310 = tosa.mul %inserted_slice_767, %3305 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3311 = tosa.add %3306, %3310 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3312 = tosa.mul %3300, %3304 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_768 = tensor.extract_slice %3300[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_769 = tensor.extract_slice %3300[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %3313 = tensor.empty() : tensor<1x32x40x64xf32>
-    %3314 = linalg.negf ins(%extracted_slice_769 : tensor<1x32x40x64xf32>) outs(%3313 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %3315 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_770 = tensor.insert_slice %3314 into %3315[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_771 = tensor.insert_slice %extracted_slice_768 into %inserted_slice_770[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %3316 = tosa.mul %inserted_slice_771, %3305 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3317 = tosa.add %3312, %3316 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3318 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %3319 = tosa.reshape %3318 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_772 = tensor.extract_slice %3319[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_773 = tensor.extract_slice %extracted_slice_772[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %3320 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %3321 = tosa.add %extracted_slice_773, %3320 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_774 = tensor.extract_slice %3321[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_775 = tensor.extract_slice %extracted_slice_774[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_776 = tensor.extract_slice %extracted_slice_775[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_777 = tensor.extract_slice %extracted_slice_776[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_778 = arith.constant 0.000000e+00 : f32
-    %splat_779 = tensor.splat %cst_778 : tensor<40x40xf32>
-    %3322 = tosa.reshape %extracted_slice_777 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %3323 = tosa.add %splat_779, %3322 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %3324 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3325 = tosa.transpose %3317, %3324 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %3326 = tosa.reshape %3311 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3327 = tosa.reshape %3325 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %3328 = tosa.matmul %3326, %3327 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_780 = arith.constant 0.0883883461 : f32
-    %splat_781 = tensor.splat %cst_780 : tensor<32x40x40xf32>
-    %3329 = tosa.mul %3328, %splat_781 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %3330 = tosa.add %3329, %3323 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %3331 = tosa.reduce_max %3330 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %3332 = tosa.sub %3330, %3331 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %3333 = math.exp %3332 : tensor<32x40x40xf32>
-    %3334 = tosa.reduce_sum %3333 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %3335 = tosa.log %3334 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %3336 = tosa.add %3331, %3335 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %3337 = tosa.sub %3330, %3336 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %3338 = math.exp %3337 : tensor<32x40x40xf32>
-    %3339 = tosa.reshape %3336 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %3340 = tosa.reshape %3303 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3341 = tosa.matmul %3338, %3340 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3342 = tosa.reshape %3341 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3343 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3344 = tosa.transpose %3342, %3343 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %3345 = tosa.reshape %3344 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %3346 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3347 = tosa.transpose %arg259, %3346 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3348 = tosa.reshape %3345 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_782 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3349 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3348, %3347 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_782 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3350 = tosa.reshape %3349 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3351 = tosa.add %3267, %3350 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3352 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_783 = arith.constant 2 : i32
-    %3353 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3351 : tensor<1x40x4096xf32>) outs(%3352 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_783 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3354 = tosa.reduce_sum %3353 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3355 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3356 = tosa.reciprocal %3355 : (tensor<1xf32>) -> tensor<1xf32>
-    %3357 = tosa.mul %3356, %3354 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3358 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3359 = tosa.add %3357, %3358 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3360 = tosa.rsqrt %3359 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3361 = tosa.mul %3351, %3360 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3362 = tosa.reshape %arg260 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3363 = tosa.mul %3362, %3361 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3364 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3365 = tosa.transpose %arg261, %3364 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3366 = tosa.reshape %3363 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_784 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3367 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3366, %3365 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_784 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3368 = tosa.reshape %3367 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3369 = tosa.sigmoid %3368 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3370 = tosa.mul %3368, %3369 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3371 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3372 = tosa.transpose %arg262, %3371 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3373 = tosa.reshape %3363 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_785 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3374 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3373, %3372 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_785 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3375 = tosa.reshape %3374 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3376 = tosa.mul %3370, %3375 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3377 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3378 = tosa.transpose %arg263, %3377 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %3379 = tosa.reshape %3376 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_786 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3380 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3379, %3378 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_786 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3381 = tosa.reshape %3380 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3382 = tosa.add %3351, %3381 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3383 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_787 = arith.constant 2 : i32
-    %3384 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3382 : tensor<1x40x4096xf32>) outs(%3383 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_787 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3385 = tosa.reduce_sum %3384 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3386 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3387 = tosa.reciprocal %3386 : (tensor<1xf32>) -> tensor<1xf32>
-    %3388 = tosa.mul %3387, %3385 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3389 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3390 = tosa.add %3388, %3389 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3391 = tosa.rsqrt %3390 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3392 = tosa.mul %3382, %3391 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3393 = tosa.reshape %arg264 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3394 = tosa.mul %3393, %3392 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3395 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3396 = tosa.transpose %arg265, %3395 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3397 = tosa.reshape %3394 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_788 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3398 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3397, %3396 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_788 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3399 = tosa.reshape %3398 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3400 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3401 = tosa.transpose %arg266, %3400 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3402 = tosa.reshape %3394 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_789 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3403 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3402, %3401 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_789 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3404 = tosa.reshape %3403 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3405 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3406 = tosa.transpose %arg267, %3405 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3407 = tosa.reshape %3394 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_790 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3408 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3407, %3406 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_790 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3409 = tosa.reshape %3408 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3410 = tosa.reshape %3399 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3411 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3412 = tosa.transpose %3410, %3411 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3413 = tosa.reshape %3404 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3414 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3415 = tosa.transpose %3413, %3414 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3416 = tosa.reshape %3409 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3417 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3418 = tosa.transpose %3416, %3417 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3419 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %3420 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %3421 = tosa.mul %3412, %3419 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_791 = tensor.extract_slice %3412[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_792 = tensor.extract_slice %3412[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %3422 = tensor.empty() : tensor<1x32x40x64xf32>
-    %3423 = linalg.negf ins(%extracted_slice_792 : tensor<1x32x40x64xf32>) outs(%3422 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %3424 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_793 = tensor.insert_slice %3423 into %3424[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_794 = tensor.insert_slice %extracted_slice_791 into %inserted_slice_793[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %3425 = tosa.mul %inserted_slice_794, %3420 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3426 = tosa.add %3421, %3425 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3427 = tosa.mul %3415, %3419 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_795 = tensor.extract_slice %3415[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_796 = tensor.extract_slice %3415[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %3428 = tensor.empty() : tensor<1x32x40x64xf32>
-    %3429 = linalg.negf ins(%extracted_slice_796 : tensor<1x32x40x64xf32>) outs(%3428 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %3430 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_797 = tensor.insert_slice %3429 into %3430[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_798 = tensor.insert_slice %extracted_slice_795 into %inserted_slice_797[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %3431 = tosa.mul %inserted_slice_798, %3420 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3432 = tosa.add %3427, %3431 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3433 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %3434 = tosa.reshape %3433 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_799 = tensor.extract_slice %3434[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_800 = tensor.extract_slice %extracted_slice_799[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %3435 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %3436 = tosa.add %extracted_slice_800, %3435 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_801 = tensor.extract_slice %3436[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_802 = tensor.extract_slice %extracted_slice_801[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_803 = tensor.extract_slice %extracted_slice_802[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_804 = tensor.extract_slice %extracted_slice_803[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_805 = arith.constant 0.000000e+00 : f32
-    %splat_806 = tensor.splat %cst_805 : tensor<40x40xf32>
-    %3437 = tosa.reshape %extracted_slice_804 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %3438 = tosa.add %splat_806, %3437 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %3439 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3440 = tosa.transpose %3432, %3439 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %3441 = tosa.reshape %3426 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3442 = tosa.reshape %3440 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %3443 = tosa.matmul %3441, %3442 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_807 = arith.constant 0.0883883461 : f32
-    %splat_808 = tensor.splat %cst_807 : tensor<32x40x40xf32>
-    %3444 = tosa.mul %3443, %splat_808 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %3445 = tosa.add %3444, %3438 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %3446 = tosa.reduce_max %3445 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %3447 = tosa.sub %3445, %3446 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %3448 = math.exp %3447 : tensor<32x40x40xf32>
-    %3449 = tosa.reduce_sum %3448 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %3450 = tosa.log %3449 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %3451 = tosa.add %3446, %3450 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %3452 = tosa.sub %3445, %3451 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %3453 = math.exp %3452 : tensor<32x40x40xf32>
-    %3454 = tosa.reshape %3451 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %3455 = tosa.reshape %3418 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3456 = tosa.matmul %3453, %3455 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3457 = tosa.reshape %3456 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3458 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3459 = tosa.transpose %3457, %3458 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %3460 = tosa.reshape %3459 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %3461 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3462 = tosa.transpose %arg268, %3461 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3463 = tosa.reshape %3460 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_809 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3464 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3463, %3462 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_809 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3465 = tosa.reshape %3464 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3466 = tosa.add %3382, %3465 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3467 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_810 = arith.constant 2 : i32
-    %3468 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3466 : tensor<1x40x4096xf32>) outs(%3467 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_810 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3469 = tosa.reduce_sum %3468 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3470 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3471 = tosa.reciprocal %3470 : (tensor<1xf32>) -> tensor<1xf32>
-    %3472 = tosa.mul %3471, %3469 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3473 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3474 = tosa.add %3472, %3473 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3475 = tosa.rsqrt %3474 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3476 = tosa.mul %3466, %3475 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3477 = tosa.reshape %arg269 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3478 = tosa.mul %3477, %3476 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3479 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3480 = tosa.transpose %arg270, %3479 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3481 = tosa.reshape %3478 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_811 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3482 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3481, %3480 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_811 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3483 = tosa.reshape %3482 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3484 = tosa.sigmoid %3483 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3485 = tosa.mul %3483, %3484 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3486 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3487 = tosa.transpose %arg271, %3486 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3488 = tosa.reshape %3478 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_812 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3489 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3488, %3487 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_812 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3490 = tosa.reshape %3489 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3491 = tosa.mul %3485, %3490 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3492 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3493 = tosa.transpose %arg272, %3492 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %3494 = tosa.reshape %3491 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_813 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3495 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3494, %3493 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_813 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3496 = tosa.reshape %3495 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3497 = tosa.add %3466, %3496 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3498 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_814 = arith.constant 2 : i32
-    %3499 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3497 : tensor<1x40x4096xf32>) outs(%3498 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_814 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3500 = tosa.reduce_sum %3499 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3501 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3502 = tosa.reciprocal %3501 : (tensor<1xf32>) -> tensor<1xf32>
-    %3503 = tosa.mul %3502, %3500 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3504 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3505 = tosa.add %3503, %3504 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3506 = tosa.rsqrt %3505 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3507 = tosa.mul %3497, %3506 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3508 = tosa.reshape %arg273 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3509 = tosa.mul %3508, %3507 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3510 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3511 = tosa.transpose %arg274, %3510 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3512 = tosa.reshape %3509 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_815 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3513 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3512, %3511 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_815 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3514 = tosa.reshape %3513 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3515 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3516 = tosa.transpose %arg275, %3515 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3517 = tosa.reshape %3509 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_816 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3518 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3517, %3516 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_816 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3519 = tosa.reshape %3518 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3520 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3521 = tosa.transpose %arg276, %3520 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3522 = tosa.reshape %3509 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_817 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3523 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3522, %3521 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_817 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3524 = tosa.reshape %3523 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3525 = tosa.reshape %3514 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3526 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3527 = tosa.transpose %3525, %3526 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3528 = tosa.reshape %3519 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3529 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3530 = tosa.transpose %3528, %3529 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3531 = tosa.reshape %3524 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3532 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3533 = tosa.transpose %3531, %3532 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3534 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %3535 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %3536 = tosa.mul %3527, %3534 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_818 = tensor.extract_slice %3527[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_819 = tensor.extract_slice %3527[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %3537 = tensor.empty() : tensor<1x32x40x64xf32>
-    %3538 = linalg.negf ins(%extracted_slice_819 : tensor<1x32x40x64xf32>) outs(%3537 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %3539 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_820 = tensor.insert_slice %3538 into %3539[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_821 = tensor.insert_slice %extracted_slice_818 into %inserted_slice_820[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %3540 = tosa.mul %inserted_slice_821, %3535 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3541 = tosa.add %3536, %3540 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3542 = tosa.mul %3530, %3534 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_822 = tensor.extract_slice %3530[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_823 = tensor.extract_slice %3530[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %3543 = tensor.empty() : tensor<1x32x40x64xf32>
-    %3544 = linalg.negf ins(%extracted_slice_823 : tensor<1x32x40x64xf32>) outs(%3543 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %3545 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_824 = tensor.insert_slice %3544 into %3545[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_825 = tensor.insert_slice %extracted_slice_822 into %inserted_slice_824[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %3546 = tosa.mul %inserted_slice_825, %3535 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3547 = tosa.add %3542, %3546 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3548 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %3549 = tosa.reshape %3548 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_826 = tensor.extract_slice %3549[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_827 = tensor.extract_slice %extracted_slice_826[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %3550 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %3551 = tosa.add %extracted_slice_827, %3550 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_828 = tensor.extract_slice %3551[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_829 = tensor.extract_slice %extracted_slice_828[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_830 = tensor.extract_slice %extracted_slice_829[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_831 = tensor.extract_slice %extracted_slice_830[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_832 = arith.constant 0.000000e+00 : f32
-    %splat_833 = tensor.splat %cst_832 : tensor<40x40xf32>
-    %3552 = tosa.reshape %extracted_slice_831 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %3553 = tosa.add %splat_833, %3552 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %3554 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3555 = tosa.transpose %3547, %3554 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %3556 = tosa.reshape %3541 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3557 = tosa.reshape %3555 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %3558 = tosa.matmul %3556, %3557 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_834 = arith.constant 0.0883883461 : f32
-    %splat_835 = tensor.splat %cst_834 : tensor<32x40x40xf32>
-    %3559 = tosa.mul %3558, %splat_835 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %3560 = tosa.add %3559, %3553 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %3561 = tosa.reduce_max %3560 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %3562 = tosa.sub %3560, %3561 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %3563 = math.exp %3562 : tensor<32x40x40xf32>
-    %3564 = tosa.reduce_sum %3563 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %3565 = tosa.log %3564 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %3566 = tosa.add %3561, %3565 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %3567 = tosa.sub %3560, %3566 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %3568 = math.exp %3567 : tensor<32x40x40xf32>
-    %3569 = tosa.reshape %3566 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %3570 = tosa.reshape %3533 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3571 = tosa.matmul %3568, %3570 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3572 = tosa.reshape %3571 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3573 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3574 = tosa.transpose %3572, %3573 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %3575 = tosa.reshape %3574 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %3576 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3577 = tosa.transpose %arg277, %3576 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3578 = tosa.reshape %3575 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_836 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3579 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3578, %3577 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_836 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3580 = tosa.reshape %3579 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3581 = tosa.add %3497, %3580 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3582 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_837 = arith.constant 2 : i32
-    %3583 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3581 : tensor<1x40x4096xf32>) outs(%3582 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_837 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3584 = tosa.reduce_sum %3583 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3585 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3586 = tosa.reciprocal %3585 : (tensor<1xf32>) -> tensor<1xf32>
-    %3587 = tosa.mul %3586, %3584 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3588 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3589 = tosa.add %3587, %3588 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3590 = tosa.rsqrt %3589 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3591 = tosa.mul %3581, %3590 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3592 = tosa.reshape %arg278 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3593 = tosa.mul %3592, %3591 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3594 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3595 = tosa.transpose %arg279, %3594 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3596 = tosa.reshape %3593 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_838 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3597 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3596, %3595 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_838 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3598 = tosa.reshape %3597 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3599 = tosa.sigmoid %3598 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3600 = tosa.mul %3598, %3599 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3601 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3602 = tosa.transpose %arg280, %3601 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3603 = tosa.reshape %3593 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_839 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3604 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3603, %3602 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_839 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3605 = tosa.reshape %3604 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3606 = tosa.mul %3600, %3605 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3607 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3608 = tosa.transpose %arg281, %3607 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %3609 = tosa.reshape %3606 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_840 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3610 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3609, %3608 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_840 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3611 = tosa.reshape %3610 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3612 = tosa.add %3581, %3611 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3613 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_841 = arith.constant 2 : i32
-    %3614 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3612 : tensor<1x40x4096xf32>) outs(%3613 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_841 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3615 = tosa.reduce_sum %3614 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3616 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3617 = tosa.reciprocal %3616 : (tensor<1xf32>) -> tensor<1xf32>
-    %3618 = tosa.mul %3617, %3615 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3619 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3620 = tosa.add %3618, %3619 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3621 = tosa.rsqrt %3620 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3622 = tosa.mul %3612, %3621 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3623 = tosa.reshape %arg282 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3624 = tosa.mul %3623, %3622 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3625 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3626 = tosa.transpose %arg283, %3625 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3627 = tosa.reshape %3624 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_842 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3628 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3627, %3626 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_842 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3629 = tosa.reshape %3628 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3630 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3631 = tosa.transpose %arg284, %3630 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3632 = tosa.reshape %3624 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_843 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3633 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3632, %3631 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_843 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3634 = tosa.reshape %3633 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3635 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3636 = tosa.transpose %arg285, %3635 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3637 = tosa.reshape %3624 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_844 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3638 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3637, %3636 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_844 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3639 = tosa.reshape %3638 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3640 = tosa.reshape %3629 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3641 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3642 = tosa.transpose %3640, %3641 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3643 = tosa.reshape %3634 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3644 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3645 = tosa.transpose %3643, %3644 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3646 = tosa.reshape %3639 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-    %3647 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3648 = tosa.transpose %3646, %3647 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-    %3649 = tosa.reshape %45 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %3650 = tosa.reshape %47 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-    %3651 = tosa.mul %3642, %3649 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_845 = tensor.extract_slice %3642[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_846 = tensor.extract_slice %3642[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %3652 = tensor.empty() : tensor<1x32x40x64xf32>
-    %3653 = linalg.negf ins(%extracted_slice_846 : tensor<1x32x40x64xf32>) outs(%3652 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %3654 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_847 = tensor.insert_slice %3653 into %3654[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_848 = tensor.insert_slice %extracted_slice_845 into %inserted_slice_847[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %3655 = tosa.mul %inserted_slice_848, %3650 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3656 = tosa.add %3651, %3655 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3657 = tosa.mul %3645, %3649 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %extracted_slice_849 = tensor.extract_slice %3645[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %extracted_slice_850 = tensor.extract_slice %3645[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-    %3658 = tensor.empty() : tensor<1x32x40x64xf32>
-    %3659 = linalg.negf ins(%extracted_slice_850 : tensor<1x32x40x64xf32>) outs(%3658 : tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
-    %3660 = tensor.empty() : tensor<1x32x40x128xf32>
-    %inserted_slice_851 = tensor.insert_slice %3659 into %3660[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %inserted_slice_852 = tensor.insert_slice %extracted_slice_849 into %inserted_slice_851[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-    %3661 = tosa.mul %inserted_slice_852, %3650 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3662 = tosa.add %3657, %3661 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3663 = tosa.reshape %19 {new_shape = array<i64: 1, 40, 41>} : (tensor<40x41xf32>) -> tensor<1x40x41xf32>
-    %3664 = tosa.reshape %3663 {new_shape = array<i64: 1, 1, 40, 41>} : (tensor<1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_853 = tensor.extract_slice %3664[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_854 = tensor.extract_slice %extracted_slice_853[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %3665 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x41xf32>}> : () -> tensor<1x1x40x41xf32>
-    %3666 = tosa.add %extracted_slice_854, %3665 : (tensor<1x1x40x41xf32>, tensor<1x1x40x41xf32>) -> tensor<1x1x40x41xf32>
-    %extracted_slice_855 = tensor.extract_slice %3666[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_856 = tensor.extract_slice %extracted_slice_855[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_857 = tensor.extract_slice %extracted_slice_856[0, 0, 0, 0] [1, 1, 40, 41] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x41xf32>
-    %extracted_slice_858 = tensor.extract_slice %extracted_slice_857[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x41xf32> to tensor<1x1x40x40xf32>
-    %cst_859 = arith.constant 0.000000e+00 : f32
-    %splat_860 = tensor.splat %cst_859 : tensor<40x40xf32>
-    %3667 = tosa.reshape %extracted_slice_858 {new_shape = array<i64: 40, 40>} : (tensor<1x1x40x40xf32>) -> tensor<40x40xf32>
-    %3668 = tosa.add %splat_860, %3667 : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32>
-    %3669 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3670 = tosa.transpose %3662, %3669 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-    %3671 = tosa.reshape %3656 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3672 = tosa.reshape %3670 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-    %3673 = tosa.matmul %3671, %3672 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-    %cst_861 = arith.constant 0.0883883461 : f32
-    %splat_862 = tensor.splat %cst_861 : tensor<32x40x40xf32>
-    %3674 = tosa.mul %3673, %splat_862 {shift = 0 : i8} : (tensor<32x40x40xf32>, tensor<32x40x40xf32>) -> tensor<32x40x40xf32>
-    %3675 = tosa.add %3674, %3668 : (tensor<32x40x40xf32>, tensor<40x40xf32>) -> tensor<32x40x40xf32>
-    %3676 = tosa.reduce_max %3675 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %3677 = tosa.sub %3675, %3676 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %3678 = math.exp %3677 : tensor<32x40x40xf32>
-    %3679 = tosa.reduce_sum %3678 {axis = 2 : i32} : (tensor<32x40x40xf32>) -> tensor<32x40x1xf32>
-    %3680 = tosa.log %3679 : (tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %3681 = tosa.add %3676, %3680 : (tensor<32x40x1xf32>, tensor<32x40x1xf32>) -> tensor<32x40x1xf32>
-    %3682 = tosa.sub %3675, %3681 : (tensor<32x40x40xf32>, tensor<32x40x1xf32>) -> tensor<32x40x40xf32>
-    %3683 = math.exp %3682 : tensor<32x40x40xf32>
-    %3684 = tosa.reshape %3681 {new_shape = array<i64: 1, 32, 40>} : (tensor<32x40x1xf32>) -> tensor<1x32x40xf32>
-    %3685 = tosa.reshape %3648 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3686 = tosa.matmul %3683, %3685 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-    %3687 = tosa.reshape %3686 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %3688 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-    %3689 = tosa.transpose %3687, %3688 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
-    %3690 = tosa.reshape %3689 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
-    %3691 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3692 = tosa.transpose %arg286, %3691 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-    %3693 = tosa.reshape %3690 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_863 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3694 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3693, %3692 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_863 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3695 = tosa.reshape %3694 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3696 = tosa.add %3612, %3695 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3697 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_864 = arith.constant 2 : i32
-    %3698 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3696 : tensor<1x40x4096xf32>) outs(%3697 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_864 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3699 = tosa.reduce_sum %3698 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3700 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3701 = tosa.reciprocal %3700 : (tensor<1xf32>) -> tensor<1xf32>
-    %3702 = tosa.mul %3701, %3699 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3703 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3704 = tosa.add %3702, %3703 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3705 = tosa.rsqrt %3704 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3706 = tosa.mul %3696, %3705 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3707 = tosa.reshape %arg287 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3708 = tosa.mul %3707, %3706 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3709 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3710 = tosa.transpose %arg288, %3709 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3711 = tosa.reshape %3708 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_865 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3712 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3711, %3710 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_865 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3713 = tosa.reshape %3712 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3714 = tosa.sigmoid %3713 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3715 = tosa.mul %3713, %3714 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3716 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3717 = tosa.transpose %arg289, %3716 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-    %3718 = tosa.reshape %3708 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_866 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
-    %3719 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3718, %3717 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_866 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-    %3720 = tosa.reshape %3719 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3721 = tosa.mul %3715, %3720 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-    %3722 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3723 = tosa.transpose %arg290, %3722 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-    %3724 = tosa.reshape %3721 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
-    %cst_867 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
-    %3725 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3724, %3723 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_867 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-    %3726 = tosa.reshape %3725 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3727 = tosa.add %3696, %3726 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %3728 = tensor.empty() : tensor<1x40x4096xf32>
-    %c2_i32_868 = arith.constant 2 : i32
-    %3729 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3727 : tensor<1x40x4096xf32>) outs(%3728 : tensor<1x40x4096xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %3745 = math.fpowi %in, %c2_i32_868 : f32, i32
-      linalg.yield %3745 : f32
-    } -> tensor<1x40x4096xf32>
-    %3730 = tosa.reduce_sum %3729 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-    %3731 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
-    %3732 = tosa.reciprocal %3731 : (tensor<1xf32>) -> tensor<1xf32>
-    %3733 = tosa.mul %3732, %3730 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3734 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
-    %3735 = tosa.add %3733, %3734 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3736 = tosa.rsqrt %3735 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-    %3737 = tosa.mul %3727, %3736 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-    %3738 = tosa.reshape %arg291 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-    %3739 = tosa.mul %3738, %3737 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-    %extracted_slice_869 = tensor.extract_slice %3739[0, 0, 0] [1, 40, 4096] [1, 1, 1] : tensor<1x40x4096xf32> to tensor<1x40x4096xf32>
-    %extracted_slice_870 = tensor.extract_slice %extracted_slice_869[0, 0, 0] [1, 40, 4096] [1, 1, 1] : tensor<1x40x4096xf32> to tensor<1x40x4096xf32>
-    %extracted_slice_871 = tensor.extract_slice %extracted_slice_870[0, 0, 0] [1, 40, 4096] [1, 1, 1] : tensor<1x40x4096xf32> to tensor<1x40x4096xf32>
-    %3740 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-    %3741 = tosa.transpose %arg292, %3740 : (tensor<32000x4096xf32>, tensor<2xi32>) -> tensor<4096x32000xf32>
-    %3742 = tosa.reshape %extracted_slice_871 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
-    %cst_872 = arith.constant dense<0.000000e+00> : tensor<40x32000xf32>
-    %3743 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3742, %3741 : tensor<40x4096xf32>, tensor<4096x32000xf32>) outs(%cst_872 : tensor<40x32000xf32>) -> tensor<40x32000xf32>
-    %3744 = tosa.reshape %3743 {new_shape = array<i64: 1, 40, 32000>} : (tensor<40x32000xf32>) -> tensor<1x40x32000xf32>
-    return %3744 : tensor<1x40x32000xf32>
-  }
-}
-
diff --git a/examples/LlamaTest/llama-import.py b/examples/LlamaTest/llama-import.py
index 552fc9dc..d4aa3631 100644
--- a/examples/LlamaTest/llama-import.py
+++ b/examples/LlamaTest/llama-import.py
@@ -100,7 +100,7 @@ def forward(self, input_ids, attention_mask, position_ids):
 graph = graphs[0]
 params = dynamo_compiler.imported_params[graph]
 
-driver = GraphDriver(graphs[0])
+driver = GraphDriver(graphs[0], 2)
 for i in range(len(driver.subgraphs)):
     driver.subgraphs[i].lower_to_top_level_ir()
 driver.construct_main_graph(True)
diff --git a/examples/SplitLlama/CMakeLists.txt b/examples/SplitLlama/CMakeLists.txt
index 9ec9ef61..6ade6cc0 100644
--- a/examples/SplitLlama/CMakeLists.txt
+++ b/examples/SplitLlama/CMakeLists.txt
@@ -566,7 +566,7 @@ add_custom_command(
     COMMENT "Building subgraph193.o "
     VERBATIM)
 
-set(Boost_INCLUDE_DIR "/home/chenweiwei/boost_1_86_0/include")
+# set(Boost_INCLUDE_DIR "/home/chenweiwei/boost_1_86_0/include")
 add_library(SPLITLLAMA5 STATIC forward193.o subgraph193.o)
 
 # 查找 Boost 库
diff --git a/examples/SplitLlama/llama-import.py b/examples/SplitLlama/llama-import.py
index 04d47241..3594fd6c 100644
--- a/examples/SplitLlama/llama-import.py
+++ b/examples/SplitLlama/llama-import.py
@@ -100,7 +100,7 @@ def forward(self, input_ids, attention_mask, position_ids):
 graph = graphs[0]
 params = dynamo_compiler.imported_params[graph]
 
-driver = GraphDriver(graphs[0])
+driver = GraphDriver(graphs[0], 2)
 for i in range(len(driver.subgraphs)):
     driver.subgraphs[i].lower_to_top_level_ir()
 
diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py
index 8c5c11e8..5315eaf9 100644
--- a/frontend/Python/graph/graph.py
+++ b/frontend/Python/graph/graph.py
@@ -68,7 +68,7 @@ class OutputDescriptor(ctypes.Structure):
 
     return OutputDescriptor
 
-
+# Graph class: the graph-level expression for the Buddy Compiler frontends.
 class Graph:
     """
     Graph is a graph-level expression for the Buddy Compiler frontends.
@@ -136,7 +136,8 @@ def __init__(
         self._output_memref = None
         self._output_descriptor = None
         self.execution_engine = None
-        self.paral_group: Dict[str, List[int]] = {}
+        self.op_groups: Dict[str, List[Op]] = {}
+        self.group_map_device: Dict[str, DeviceType] = {}
 
     @property
     def body(self):
@@ -514,8 +515,7 @@ def addsymbol(self) -> None:
         """
         for key, value in self._symbol_table.items():
             print(f"Key: {key}, Value: {value}")
-            
-
+    
     def import_graph(self) -> ir.Module:
         """
         Imports buddy graph and generates an MLIR module in high-level dialects.
@@ -524,6 +524,7 @@ def import_graph(self) -> ir.Module:
             mlir.ir.Module: An MLIR module in high-level dialects.
         """
         assert self._do_param_pack == False
+        # Populate the module body with the MLIR operations that correspond to the ops in the computation graph.
         with ir.InsertionPoint(self._module.body):
             arguments = []
             inputs = self._params + self._inputs
@@ -538,7 +539,7 @@ def import_graph(self) -> ir.Module:
                 if isinstance(node, FuncOp):
                     extern_func.append(node)
                     self._import_op(node)
-
+            # Wrap the Python function below as an MLIR function operation (FuncOp).
             @func.FuncOp.from_py_func(*arguments, name=self._func_name)
             def generated_func(*args):
                 args_list = list(args)
@@ -590,6 +591,7 @@ def import_main_graph(self) -> ir.Module:
         Returns:
             mlir.ir.Module: An MLIR module in high-level dialects.
         """
+        # Populate the module body with the MLIR operations that correspond to the ops in the computation graph.
         with ir.InsertionPoint(self._module.body):
             arguments = []
             if self._do_param_pack:
@@ -610,10 +612,13 @@ def import_main_graph(self) -> ir.Module:
                     extern_func.append(node)
                     self._import_op(node)
 
+            # Wrap the Python function below as an MLIR function operation (FuncOp).
             @func.FuncOp.from_py_func(*arguments, name=self._func_name)
             def generated_func(*args):
                 args_list = list(args)
+                # Walk the nodes of the computation graph and handle each one according to its type.
                 for node in self._body:
+                    # External functions were already imported above; skip them here.
                     if node in extern_func:
                         continue
                     if isinstance(node, OutputOp):
@@ -628,6 +633,7 @@ def generated_func(*args):
                             node.tensor_meta['shape'] = torch.Size(list(node._newshape))
                         self._import_placeholder(node, args_list)
                     elif isinstance(node, GetItemOp):
+                        # print(self._symbol_table)
                         self._symbol_table[(str(node.name), 0)] = (
                             self._symbol_table[
                                 (str(node.args[0]), node.args[1])
@@ -635,7 +641,7 @@ def generated_func(*args):
                         )
                     else:
                         self._import_op(node)
-
+                    
                 return self._symbol_table.get(("output", 0))
 
         return self._module
@@ -693,9 +699,11 @@ def _import_op(self, node: Op):
 
         """
         op_name = node.__class__.__name__
+        # Look up the op's class name in the ops registry and call the matching function to generate the MLIR operation.
         op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = (
             self._ops_registry[op_name](node, self._symbol_table)
         )
+        # Record the resulting MLIR values in the symbol table, depending on the type of the return value.
         if isinstance(op_ret, tuple | List):
             for i, operation in enumerate(op_ret):
                 if isinstance(operation, ir.Operation) or isinstance(
diff --git a/frontend/Python/graph/graph_driver.py b/frontend/Python/graph/graph_driver.py
index 2df1d6a9..71426506 100644
--- a/frontend/Python/graph/graph_driver.py
+++ b/frontend/Python/graph/graph_driver.py
@@ -50,7 +50,7 @@ class GraphDriver:
     - _subgraphs_outputs (dict): A dictionary mapping subgraph names to their
     output op's result.
     """
-    def __init__(self, graph: Graph, parallelism: int = 2) -> None:
+    def __init__(self, graph: Graph, parallelism: int = 1) -> None:
         """
         Initialize the GraphDriver object with a given computational graph.
 
@@ -64,8 +64,8 @@ def __init__(self, graph: Graph, parallelism: int = 2) -> None:
         self._graph = graph
         self._parallelism = parallelism
         # 对原图的操作分组
-        self.op_groups: Dict[str, List[Op]] = {}
-        self.group_map_device: Dict[str, DeviceType] = {}
+        self.op_groups = self._graph.op_groups
+        self.group_map_device = self._graph.group_map_device
         self._subgraph_dependencies = {}
         self._paral_op_shape: Dict[str, List[int]] = {}
         (
@@ -99,9 +99,9 @@ def subgraph_param_indices(self):
     
     def _add_paral_op_shape(self, op_name, shape):
         if op_name not in self._paral_op_shape.keys():
-            # print(op_name, shape)
             self._paral_op_shape[op_name] = shape
 
+    # Unify the ranks of two shapes with different numbers of dimensions by expanding the lower-rank shape to the higher rank.
     def _normalize_binary_operator_shape(self, shp1, shp2):
         """Normalize the shape of two input tensors according to the broadcasting
         rule"""
@@ -114,7 +114,8 @@ def _normalize_binary_operator_shape(self, shp1, shp2):
 
         return shp1, shp2
     
-    def _infer_new_shape_with_neg_one(self, old_shape, new_shape):
+    # When the output shape contains -1, infer the output shape from the input shape.
+    def _infer_new_shape(self, old_shape, new_shape):
         total_size = 1
         for dim_siz in old_shape:
             total_size *= dim_siz
@@ -136,6 +137,7 @@ def _infer_new_shape_with_neg_one(self, old_shape, new_shape):
                     new_shape[i] = infer_dim_size
         return new_shape
     
+    # 获取参数的总大小
     def get_pack_params_size(self, tensors_meta: list[TensorMeta]) -> int:
         param_total_size = 0
         for tensor_meta in tensors_meta:
@@ -143,86 +145,96 @@ def get_pack_params_size(self, tensors_meta: list[TensorMeta]) -> int:
                 lambda x, y: x * y, list(tensor_meta.shape), 1
             )
         return param_total_size
+    
+    # 生成模型的分割策略，并根据这个策略推断出每个子模块的输入和输出参数
     def get_split_strategy(self):
-        """
-        Group ops based on the computational graph in terms of subgraphs.
-        
-        Analyse the inputs and outputs of each subgraph.
-
-        Update the shape information of the nodes in each subgraph 
-        associated with the weight matrix to be split.
-
-        Returns:
-        - None
-        """
+      """
+      Group ops based on the computational graph in terms of subgraphs.
+      
+      Analyse the inputs and outputs of each subgraph.
+
+      Update the shape information of the nodes in each subgraph 
+      associated with the weight matrix to be split.
+
+      Returns:
+      - None
+      """
+      # 对是否需要分割两种情况进行处理
+      if self._parallelism < 1:
+        raise ValueError(
+          "Parallelism must be greater than or equal to 1")
+      elif self._parallelism > 1:
+        self.op_groups = {}
+        self.group_map_device = {}
         # 对计算图的op进行分组，分组策略为：以PowOp为间隔放在一个subgraph中，忽略PlaceholderOp和OutputOp
         submodel_count = 0
         ops_count = [6, 50, 2, 6, 14, 2]
         pow_count = 0
         tsf_count = 0
         for i, op in enumerate(self._graph._body):
-            if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp):
-                continue
-            
-            if "subgraph{}".format(submodel_count) not in self.op_groups.keys():
-                subgraph_name = "subgraph{}".format(submodel_count)
-                self.group_map_device[subgraph_name] = DeviceType.CPU
-                self.op_groups[subgraph_name] = [op]
-                continue
-            
-            # todo: Added handling of more complex embedding cases
-            if isinstance(op, PowOp): 
-                pow_count += 1
-                submodel_count += 1 
-                tsf_count = 1
-                subgraph_name = "subgraph{}".format(submodel_count)
-                self.group_map_device[subgraph_name] = DeviceType.CPU
-                self.op_groups[subgraph_name] = [op]
-                continue
-            
-            if pow_count > 0 and pow_count < 65:
-                if tsf_count == ops_count[(submodel_count-1)%6]:
-                    tsf_count = 1
-                    submodel_count += 1
-                    subgraph_name = "subgraph{}".format(submodel_count)
-                    self.group_map_device[subgraph_name] = DeviceType.CPU
-                    self.op_groups[subgraph_name] = [op]
-                    continue
-                else:
-                    tsf_count += 1
-
+          if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp):
+            continue
+          
+          if "subgraph{}".format(submodel_count) not in self.op_groups.keys():
             subgraph_name = "subgraph{}".format(submodel_count)
-            group = self.op_groups[subgraph_name]
-            group.append(op)
-            self.op_groups[subgraph_name] = group
-
-        self._subgraph_dependencies = {
-            subgraph_name: set()
-            for subgraph_name in list(self.op_groups.keys())
-        }
-
-        # 识别每个子图的输入节点，并将这些输入节点存储在subgraphs_inputs中
-        # 每个子图的输入节点是那些不属于当前子图但与当前子图中的操作有依赖关系的节点。
-        subgraphs_inputs = {}
-
-        # 预置每个权重矩阵的分割位置
-        paral_pos0 = [-1, -1, -1]
-        paral_pos1 = [[1, -1], [0, -1, 0, 0, -1, -1, -1, 1], [0, 1], [1, -1], [0, -1, 0, 1], [0, 1]]
-
-        # Identify inputs for each subgraph
-        for i, subgraph_name in enumerate(self.op_groups.keys()):
-            subgraphs_inputs[subgraph_name] = []
-            if(i == 0 or i == 193):
-                paral_pos = paral_pos0
+            self.group_map_device[subgraph_name] = DeviceType.CPU
+            self.op_groups[subgraph_name] = [op]
+            continue
+          
+          # todo: Added handling of more complex embedding cases
+          if isinstance(op, PowOp): 
+            pow_count += 1
+            submodel_count += 1 
+            tsf_count = 1
+            subgraph_name = "subgraph{}".format(submodel_count)
+            self.group_map_device[subgraph_name] = DeviceType.CPU
+            self.op_groups[subgraph_name] = [op]
+            continue
+          
+          if pow_count > 0 and pow_count < 65:
+            if tsf_count == ops_count[(submodel_count-1)%6]:
+              tsf_count = 1
+              submodel_count += 1
+              subgraph_name = "subgraph{}".format(submodel_count)
+              self.group_map_device[subgraph_name] = DeviceType.CPU
+              self.op_groups[subgraph_name] = [op]
+              continue
             else:
-                paral_pos = paral_pos1[(i-1)%6]
-            input_count = 0
-            for op in self.op_groups[subgraph_name]:
-                for parent in op._parents:
-                    op_parent = self._graph.node_table[parent]
-                    if ( op_parent not in self.op_groups[subgraph_name]
-                    ) and (op_parent not in subgraphs_inputs[subgraph_name]):
-                        subgraphs_inputs[subgraph_name].append(op_parent)
+              tsf_count += 1
+
+          subgraph_name = "subgraph{}".format(submodel_count)
+          group = self.op_groups[subgraph_name]
+          group.append(op)
+          self.op_groups[subgraph_name] = group
+          
+      self._subgraph_dependencies = {
+          subgraph_name: set()
+          for subgraph_name in list(self.op_groups.keys())
+      }
+
+      # 识别每个子图的输入节点，并将这些输入节点存储在subgraphs_inputs中
+      # 每个子图的输入节点是那些不属于当前子图但与当前子图中的操作有依赖关系的节点。
+      subgraphs_inputs = {}
+
+      # 预置每个权重矩阵的分割位置
+      paral_pos0 = [-1, -1, -1]
+      paral_pos1 = [[1, -1], [0, -1, 0, 0, -1, -1, -1, 1], [0, 1], [1, -1], [0, -1, 0, 1], [0, 1]]
+
+      # Identify inputs for each subgraph
+      for i, subgraph_name in enumerate(self.op_groups.keys()):
+          subgraphs_inputs[subgraph_name] = []
+          if(i == 0 or i == 193):
+              paral_pos = paral_pos0
+          else:
+              paral_pos = paral_pos1[(i-1)%6]
+          input_count = 0
+          for op in self.op_groups[subgraph_name]:
+              for parent in op._parents:
+                  op_parent = self._graph.node_table[parent]
+                  if ( op_parent not in self.op_groups[subgraph_name]
+                  ) and (op_parent not in subgraphs_inputs[subgraph_name]):
+                      subgraphs_inputs[subgraph_name].append(op_parent)
+                      if self._parallelism > 1:
                         op_parent_shape = list(op_parent.tensor_meta["shape"])
                         pos = paral_pos[input_count]
                         input_count += 1 
@@ -230,134 +242,137 @@ def get_split_strategy(self):
                             op_parent_shape[pos] = op_parent_shape[pos] // self._parallelism
                             self._add_paral_op_shape(parent, op_parent_shape)
 
-        subgraphs_outputs = {}
-        output_node = []
-        
-        # 识别整个图的输出节点，并将这些输出节点存储在output_node列表中，收集整个图的所有输出节点
-        # Identify output nodes of the entire graph
-        for node in self._graph.body:
-            if isinstance(node, OutputOp):
-                for arg in node.args:
-                    if(arg not in output_node):
-                        output_node.append(arg)
-
-        # 识别每个子图的输出节点，并建立子图之间的依赖关系。
-        # Identify outputs for each subgraph and build dependencies between subgraphs
-        for subgraph_name in self.op_groups.keys():
-            subgraphs_outputs[subgraph_name] = []
-            for op in self.op_groups[subgraph_name]:
-                for key in subgraphs_inputs.keys():
-                    if op in subgraphs_inputs[key]:
-                        if(op not in subgraphs_outputs[subgraph_name]):
-                            subgraphs_outputs[subgraph_name].append(op)
-                        self._subgraph_dependencies[subgraph_name].add(key)
-                if (op.name in output_node) and (
-                    op not in subgraphs_outputs[subgraph_name]
-                ):
-                    subgraphs_outputs[subgraph_name].append(op)
-        
-        # 更新每个子图中与需要拆分的权重矩阵相关的节点的shape信息
-        for subgraph_name in self.op_groups.keys():
-            new_shape = []
-            for node in self.op_groups[subgraph_name]:
-                if isinstance(node, PermuteOp) and node != self.op_groups[subgraph_name][-1]:
-                    if node.args[0] in self._paral_op_shape.keys():
-                        old_shape = self._paral_op_shape[node.args[0]]
-                        new_shape = [old_shape[index] for index in node.args[1]]
-                        if node != self.op_groups[subgraph_name][-1]:
-                            self._add_paral_op_shape(node.name, new_shape)
-                elif isinstance(node, MatmulOp) and node != self.op_groups[subgraph_name][-1]:
-                    assert len(node.args) == 2
-                    # 由于MatmulOp的输入参数是其他op的结果，所以无法通过两个参数的shape来预测出结果shape
-                    for op_arg in node.args:
-                        if op_arg in self._paral_op_shape.keys():
-                            if(node.args[0] in self._paral_op_shape.keys()):
-                                input1_shape = self._paral_op_shape[node.args[0]]
-                            else:
-                                input1 = self._graph.node_table[node.args[0]]
-                                input1_shape = list(input1.tensor_meta["shape"])
-                            if(node.args[1] in self._paral_op_shape.keys()):
-                                input2_shape = self._paral_op_shape[node.args[1]]
-                            else:
-                                input2 = self._graph.node_table[node.args[1]]
-                                input2_shape = list(input2.tensor_meta["shape"])
-                            new_shape = input1_shape
-                            new_shape[-1] = input2_shape[-1]
-                            self._add_paral_op_shape(node.name, new_shape)
-                            break
-                elif isinstance(node, AddOp | SubOp | MulOp | DivOp) and node != self.op_groups[subgraph_name][-1]:
-                    assert len(node.args) == 2
-                    # 由于MatmulOp的输入参数是其他op的结果，所以无法通过两个参数的shape来预测出结果shape
-                    for i, op_arg in enumerate(node.args):
-                        if op_arg in self._paral_op_shape.keys():
-                            broadcasted_result_shp = []
-                            if isinstance(node.args[1-i], int | float):
-                                broadcasted_result_shp = self._paral_op_shape[op_arg]
-                            else:
-                                if(node.args[0] in self._paral_op_shape.keys()):
-                                    input1_shape = self._paral_op_shape[node.args[0]]
-                                else:
-                                    input1 = self._graph.node_table[node.args[0]]
-                                    input1_shape = list(input1.tensor_meta["shape"])
-                                if(node.args[1] in self._paral_op_shape.keys()):
-                                    input2_shape = self._paral_op_shape[node.args[1]]
-                                else:
-                                    input2 = self._graph.node_table[node.args[1]]
-                                    input2_shape = list(input2.tensor_meta["shape"])
-                                norm_input1_shape, norm_input2_shape = self._normalize_binary_operator_shape(
-                                    input1_shape, input2_shape
-                                )
-                                for dim1, dim2 in zip(norm_input1_shape, norm_input2_shape):
-                                    broadcasted_result_shp.append(max(dim1, dim2))
-                            self._add_paral_op_shape(node.name, broadcasted_result_shp)
-                    del i
-                elif isinstance(node, ViewOp) and node != self.op_groups[subgraph_name][-1]:
-                    if node.args[0] in self._paral_op_shape.keys():
-                        old_shape = self._paral_op_shape[node.args[0]]
-                        old_len = len(old_shape)
-                        tmp_old_shape = []
-                        for i in range(old_len):
-                            if old_shape[i] != 1:
-                                tmp_old_shape.append(old_shape[i])
-                        new_shape = list(node.args[1])
-                        new_len = len(new_shape)
-                        tmp_new_shape = []
-                        for i in range(new_len):
-                            if new_shape[i] != 1:
-                                tmp_new_shape.append(new_shape[i])
-                        if len(tmp_old_shape) == len(tmp_new_shape):
-                            # todo: 待优化，当前处理方式只考虑<MxNx...> <--> <1xMxNx...>的情况
-                            if old_len < new_len:
-                                for i in range(old_len):
-                                    new_shape[i+1] = old_shape[i]
-                            else:
-                                for i in range(new_len):
-                                    new_shape[i] = old_shape[i+1]
-                        else:
-                            # todo: 待优化，当前处理方式只考虑<...xMxN> <--> <...xMxYxZ>(其中N=YxZ)的情况
-                            if old_len < new_len:
-                                new_shape[-2] = old_shape[-1] // new_shape[-1]
-                            else:
-                                new_shape = self._infer_new_shape_with_neg_one(old_shape, new_shape)
-                        self._add_paral_op_shape(node.name, new_shape)
-                elif isinstance(node, CatOp) and node != self.op_groups[subgraph_name][-1]:
-                    for op_arg in node.args[0]:
-                        op_arg = str(op_arg)
-                        if op_arg in self._paral_op_shape.keys():
-                            new_shape = self._paral_op_shape[op_arg]
-                            self._add_paral_op_shape(node.name, new_shape)
-                            break
-                else:
-                    if node != self.op_groups[subgraph_name][-1]:
-                    # 默认不属于上述的操作的算子算子都与被切分的算子的shape相同
-                        for i, op_arg in enumerate(node.args):
-                            if op_arg in self._paral_op_shape.keys():
-                                new_shape = self._paral_op_shape[op_arg]
-                                self._add_paral_op_shape(node.name, new_shape)
-                                break
-        
+      subgraphs_outputs = {}
+      output_node = []
+      
+      # 识别整个图的输出节点，并将这些输出节点存储在output_node列表中，收集整个图的所有输出节点
+      # Identify output nodes of the entire graph
+      for node in self._graph.body:
+          if isinstance(node, OutputOp):
+              for arg in node.args:
+                  if(arg not in output_node):
+                      output_node.append(arg)
+
+      # 识别每个子图的输出节点，并建立子图之间的依赖关系。
+      # Identify outputs for each subgraph and build dependencies between subgraphs
+      for subgraph_name in self.op_groups.keys():
+          subgraphs_outputs[subgraph_name] = []
+          for op in self.op_groups[subgraph_name]:
+              for key in subgraphs_inputs.keys():
+                  if op in subgraphs_inputs[key]:
+                      if(op not in subgraphs_outputs[subgraph_name]):
+                          subgraphs_outputs[subgraph_name].append(op)
+                      self._subgraph_dependencies[subgraph_name].add(key)
+              if (op.name in output_node) and (
+                  op not in subgraphs_outputs[subgraph_name]
+              ):
+                  subgraphs_outputs[subgraph_name].append(op)
+      
+      # 更新每个子图中与需要拆分的权重矩阵相关的节点的shape信息，对PermuteOp, MatmulOp, AddOp, SubOp, MulOp, DivOp, ViewOp, CatOp等会根据输入数据的shape而改变输出数据的shape的算子进行特殊处理
+      if self._parallelism == 1:
         return subgraphs_inputs, subgraphs_outputs
+      for subgraph_name in self.op_groups.keys():
+          new_shape = []
+          for node in self.op_groups[subgraph_name]:
+              if isinstance(node, PermuteOp) and node != self.op_groups[subgraph_name][-1]:
+                  if node.args[0] in self._paral_op_shape.keys():
+                      old_shape = self._paral_op_shape[node.args[0]]
+                      new_shape = [old_shape[index] for index in node.args[1]]
+                      if node != self.op_groups[subgraph_name][-1]:
+                          self._add_paral_op_shape(node.name, new_shape)
+              elif isinstance(node, MatmulOp) and node != self.op_groups[subgraph_name][-1]:
+                  assert len(node.args) == 2
+                  # 由于MatmulOp的输入参数是其他op的结果，所以无法通过两个参数的shape来预测出结果shape
+                  for op_arg in node.args:
+                      if op_arg in self._paral_op_shape.keys():
+                          if(node.args[0] in self._paral_op_shape.keys()):
+                              input1_shape = self._paral_op_shape[node.args[0]]
+                          else:
+                              input1 = self._graph.node_table[node.args[0]]
+                              input1_shape = list(input1.tensor_meta["shape"])
+                          if(node.args[1] in self._paral_op_shape.keys()):
+                              input2_shape = self._paral_op_shape[node.args[1]]
+                          else:
+                              input2 = self._graph.node_table[node.args[1]]
+                              input2_shape = list(input2.tensor_meta["shape"])
+                          new_shape = input1_shape
+                          new_shape[-1] = input2_shape[-1]
+                          self._add_paral_op_shape(node.name, new_shape)
+                          break
+              elif isinstance(node, AddOp | SubOp | MulOp | DivOp) and node != self.op_groups[subgraph_name][-1]:
+                  assert len(node.args) == 2
+                  # 由于二元算子(Add/Sub/Mul/Div)的输入参数可能是其他op的结果，需优先使用已记录的切分后shape，再按广播规则推断结果shape
+                  for i, op_arg in enumerate(node.args):
+                      if op_arg in self._paral_op_shape.keys():
+                          broadcasted_result_shp = []
+                          if isinstance(node.args[1-i], int | float):
+                              broadcasted_result_shp = self._paral_op_shape[op_arg]
+                          else:
+                              if(node.args[0] in self._paral_op_shape.keys()):
+                                  input1_shape = self._paral_op_shape[node.args[0]]
+                              else:
+                                  input1 = self._graph.node_table[node.args[0]]
+                                  input1_shape = list(input1.tensor_meta["shape"])
+                              if(node.args[1] in self._paral_op_shape.keys()):
+                                  input2_shape = self._paral_op_shape[node.args[1]]
+                              else:
+                                  input2 = self._graph.node_table[node.args[1]]
+                                  input2_shape = list(input2.tensor_meta["shape"])
+                              norm_input1_shape, norm_input2_shape = self._normalize_binary_operator_shape(
+                                  input1_shape, input2_shape
+                              )
+                              for dim1, dim2 in zip(norm_input1_shape, norm_input2_shape):
+                                  broadcasted_result_shp.append(max(dim1, dim2))
+                          self._add_paral_op_shape(node.name, broadcasted_result_shp)
+                  del i
+              elif isinstance(node, ViewOp) and node != self.op_groups[subgraph_name][-1]:
+                  if node.args[0] in self._paral_op_shape.keys():
+                      old_shape = self._paral_op_shape[node.args[0]]
+                      old_len = len(old_shape)
+                      tmp_old_shape = []
+                      for i in range(old_len):
+                          if old_shape[i] != 1:
+                              tmp_old_shape.append(old_shape[i])
+                      new_shape = list(node.args[1])
+                      new_len = len(new_shape)
+                      tmp_new_shape = []
+                      for i in range(new_len):
+                          if new_shape[i] != 1:
+                              tmp_new_shape.append(new_shape[i])
+                      if len(tmp_old_shape) == len(tmp_new_shape):
+                          # todo: 待优化，当前处理方式只考虑<MxNx...> <--> <1xMxNx...>的情况
+                          if old_len < new_len:
+                              for i in range(old_len):
+                                  new_shape[i+1] = old_shape[i]
+                          else:
+                              for i in range(new_len):
+                                  new_shape[i] = old_shape[i+1]
+                      else:
+                          # todo: 待优化，当前处理方式只考虑<...xMxN> <--> <...xMxYxZ>(其中N=YxZ)的情况
+                          if old_len < new_len:
+                              new_shape[-2] = old_shape[-1] // new_shape[-1]
+                          else:
+                              new_shape = self._infer_new_shape(old_shape, new_shape)
+                      self._add_paral_op_shape(node.name, new_shape)
+              elif isinstance(node, CatOp) and node != self.op_groups[subgraph_name][-1]:
+                  for op_arg in node.args[0]:
+                      op_arg = str(op_arg)
+                      if op_arg in self._paral_op_shape.keys():
+                          new_shape = self._paral_op_shape[op_arg]
+                          self._add_paral_op_shape(node.name, new_shape)
+                          break
+              else:
+                  if node != self.op_groups[subgraph_name][-1]:
+                  # 默认不属于上述的操作的算子都与被切分的算子的shape相同
+                      for i, op_arg in enumerate(node.args):
+                          if op_arg in self._paral_op_shape.keys():
+                              new_shape = self._paral_op_shape[op_arg]
+                              self._add_paral_op_shape(node.name, new_shape)
+                              break
+      
+      return subgraphs_inputs, subgraphs_outputs
     
+    # 根据分组信息构建子图，为每个子图创建一个新的Graph对象，并将相关的操作和输入输出节点添加到该子图中。
     def build_subgraph_by_group(self):
         """
         Builds subgraphs from a given graph based on groups.
@@ -373,7 +388,6 @@ def build_subgraph_by_group(self):
 
         # Construct each subgraph
         for subgraph_name in self.op_groups.keys():
-            # print(f"-----------------------subgraph{m}------------------------------")
             subgraph_input = []
             subgraph_body = {}
             # 设备信息
@@ -402,7 +416,7 @@ def build_subgraph_by_group(self):
                 # 与权重文件相关的操作指参数中包含权重矩阵或参数根据权重矩阵计算获得的操作
 
                 # ReshapeOp会改变shape,需要更新shape参数列表
-                if isinstance(op, ViewOp):
+                if isinstance(op, ViewOp) and self._parallelism > 1:
                     if op.args[0] in self._paral_op_shape.keys():
                         op._newshape = self._paral_op_shape[op.name]
                 subgraph_body[op.name] = op
@@ -431,7 +445,7 @@ def build_subgraph_by_group(self):
             subgraphs[subgraph_name] = subgraph
 
         return subgraphs
-
+    
     def topological_sort_subgraph(self):
         """
         Performs topological sorting on the subgraphs based on their dependencies.
@@ -464,6 +478,8 @@ def topological_sort_subgraph(self):
             else None
         )
 
+    # 构建主图，包含子图的调用和占位符操作，根据并行度修改各个参数的shape信息.
+    # 每个子图对应一个主图，主图中取权重参数并调用子图函数.
     def construct_main_graph(self, do_param_pack=False):
         """
         Constructs the main computational graph by incorporating subgraphs' call
@@ -480,26 +496,11 @@ def construct_main_graph(self, do_param_pack=False):
         implementation.
 
         """
-
-        # Analysis topology order to sort subgraph call.
-        topo_order = self.topological_sort_subgraph()
-        if topo_order == None:
-            print("Error : Graph Partitioning is illegal!")
-            return None
-
         # Analysis topology order to sort subgraph call.
         topo_order = self.topological_sort_subgraph()
         if topo_order == None:
             print("Error : Graph Partitioning is illegal!")
             return None
-        
-        # fake_params_offsets = []
-        # current_fake_param_offset = 0
-        # for tensorMeta in self._graph._fake_params:
-        #     fake_params_offsets.append(current_fake_param_offset)
-        #     current_fake_param_offset += functools.reduce(
-        #         lambda x, y: x * y, list(tensorMeta.shape), 1
-        #     )
 
         # 为每个子图创建一个FuncOp节点，并将这些节点添加到主图中。
         # Adding FuncOp nodes for each subgraph
@@ -509,13 +510,22 @@ def construct_main_graph(self, do_param_pack=False):
         for i, subgraph_name in enumerate(self._subgraphs.keys()):
             main_graph_name = "forward{}".format(i)
             current_param_info = {} # 存储参数索引和分割方式
-            main_graph = Graph(
-                [],
-                [],
+            if self._parallelism > 1:  
+              main_graph = Graph(
+                  [],
+                  [],
+                  self._graph._ops_registry,
+                  main_graph_name,
+                  self._graph._verbose,
+              )
+            else:
+              main_graph = Graph(
+                self._graph._inputs,
+                self._graph._fake_params,
                 self._graph._ops_registry,
-                main_graph_name,
+                self._graph._func_name,
                 self._graph._verbose,
-            )
+              )
             # 为每个子图创建一个FuncOp节点，并将这些节点添加到对应主图中。
             # FuncOp节点代表每个子图，用于主图对子图的调用
             func_node = FuncOp()
@@ -525,7 +535,7 @@ def construct_main_graph(self, do_param_pack=False):
                 func_node.add_argument(inp)
             
             outputs = self._subgraphs[subgraph_name]._outputs
-            if outputs is None:
+            if outputs is None or self._parallelism == 1:
                 for output in self._subgraphs_outputs[subgraph_name]:
                     func_node.tensor_meta["shape"].append(
                         self._graph.node_table[output.name].tensor_meta["shape"]
@@ -543,6 +553,7 @@ def construct_main_graph(self, do_param_pack=False):
                         self._graph.node_table[output.name].tensor_meta["dtype"]
                     )
             main_graph.add_node(func_node)
+            
             # Adding placeholder operations from the original graph
             ph_count : int = 0
             # 记录子图中是否有权重矩阵被分割
@@ -551,9 +562,9 @@ def construct_main_graph(self, do_param_pack=False):
             current_param_info["total_partitions"] = 1
             split_group.append(1)
             for node in self._graph.body:
-                if isinstance(node, PlaceholderOp) :
+                if isinstance(node, PlaceholderOp):
                     if node in self._subgraphs_inputs[subgraph_name]:
-                        if(len(self._graph._fake_params) > (ph_count)):
+                        if(len(self._graph._fake_params) > (ph_count) and self._parallelism > 1):
                             main_graph._fake_params.append(self._graph._fake_params[ph_count])
                             if node.name in self._paral_op_shape.keys():
                                 node._newshape = self._paral_op_shape[node.name]
@@ -567,6 +578,7 @@ def construct_main_graph(self, do_param_pack=False):
                                     {"index": ph_count, "split_degree": []}
                                 )
                         main_graph.add_node(node)
+
                     ph_count += 1
             param_size_group.append(self.get_pack_params_size(main_graph._fake_params))
 
@@ -575,24 +587,24 @@ def construct_main_graph(self, do_param_pack=False):
                 split_group[-1] = self._parallelism
             self._subgraph_param_info[subgraph_name] = current_param_info
 
-            # Identify inputs for each subgraph
             maingraph_input = inputs0
-            for node in self._subgraphs_inputs[subgraph_name]:
+            # Identify inputs for each subgraph
+            if self._parallelism > 1:
+              for node in self._subgraphs_inputs[subgraph_name]:
                 if (node.name not in main_graph.node_table.keys()):
-                    if node.name in self._paral_op_shape.keys():
-                        node_shape = self._paral_op_shape[node.name]
-                        # issplit = True
-                    else:
-                        node_shape = node.tensor_meta["shape"]
-                    node_dtype = node.tensor_meta["dtype"]
-                    input_tensor_meta = TensorMeta(node_shape, node_dtype)
-                    maingraph_input.append(input_tensor_meta)
-                    placeholder_node = PlaceholderOp()
-                    placeholder_node.name = node.name
-                    placeholder_node.tensor_meta = input_tensor_meta
-                    main_graph._body.append(placeholder_node)
-            # if issplit: 
-            #     current_param_info["total_partitions"] = self._parallelism
+                  if node.name in self._paral_op_shape.keys():
+                    node_shape = self._paral_op_shape[node.name]
+                    # issplit = True
+                  else:
+                    node_shape = node.tensor_meta["shape"]
+                  node_dtype = node.tensor_meta["dtype"]
+                  input_tensor_meta = TensorMeta(node_shape, node_dtype)
+                  maingraph_input.append(input_tensor_meta)
+                  placeholder_node = PlaceholderOp()
+                  placeholder_node.name = node.name
+                  placeholder_node.tensor_meta = input_tensor_meta
+                  main_graph._body.append(placeholder_node)
+
             # Adding CallOp to invoke the single subgraph
             call_node = CallOp()
             call_node.name = "call{}".format(i)
@@ -610,10 +622,10 @@ def construct_main_graph(self, do_param_pack=False):
                         )
                         break
             outputs = self._subgraphs[subgraph_name]._outputs
-            if outputs is None:
+            if outputs is None or self._parallelism == 1:
                 for output in self._subgraphs_outputs[subgraph_name]:
-                    call_node.tensor_meta["dtype"].append(
-                        self._graph.node_table[output.name].tensor_meta["dtype"]
+                    call_node.tensor_meta["shape"].append(
+                        self._graph.node_table[output.name].tensor_meta["shape"]
                     )
                     call_node.tensor_meta["dtype"].append(
                         self._graph.node_table[output.name].tensor_meta["dtype"]
@@ -654,6 +666,8 @@ def construct_main_graph(self, do_param_pack=False):
                     main_graph._ops_registry,
                     do_param_pack,
                 )
+                if self._parallelism == 1:
+                    return main_importer.import_main_graph()
                 self._modules[main_graph_name] = main_importer.import_main_graph()
                 inputs0 = []