diff --git a/README.md b/README.md
index 98dd9a8..329944a 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,58 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+[![](images/youtube.png)](https://www.youtube.com/watch?v=g5J5UmyLcXA)
 
-### (TODO: Your README)
+* Daniel Daley-Mongtomery
+* Tested on: MacBook Pro, OSX 10.12, i7 @ 2.3GHz, 16GB RAM, GT 750M 2048MB (Personal Machine)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+### Details
+
+  The [boids](https://en.wikipedia.org/wiki/Boids) algorithm by Craig Reynolds uses three simple rules to simulated flocking motions movement of virtual agents:
+  
+>**1. Cohesion** - Boids move towards the perceived center of mass of their neighbors
+>
+>**2. Separation** - Boids avoid getting to close to their neighbors
+>
+>**3. Alignment** - Boids generally try to move with the same direction and speed as their neighbors
+
+  Boids poll for these behaviors within a specified radius of influence, ignoring boids outside of that radius. This makes for an embarrassingly parallel system and a perfect opportunity to try out Nvidia's CUDA API and simulate as many birds as possible.
+  
+##### Basic Implementation
+
+  At the most naive level we can perform a doubly-nested loop, like so:
+  ```
+  for every boid i in parallel {
+    for every bird j {
+      if (distance(i,j) < radius) i.flockWith(j)
+    }
+  }
+  ```
+  
+  While this earns us some parallelism, we're likely performing thousands to hundreds of thousands of unecessary distance checks *for every boid*, and *on every step*. Instead, we'd like to know specifically where to look for nearby boids.
+  
+##### Uniform Grid
+
+  Instead of having every boid perform a position classification, I had the sysytem itself perform one such classification at the beginning of every step. This O(n) step to classify and sort the boids beforehand required a DxDxD grid, where D represented the diameter of the boids' area of influence. This way, a boids had to consult at most 4 grid cells to instantly know all of its neigbors.
+  
+![a uniform grid in 2D](images/Boids%20Ugrid%20neighbor%20search%20shown.png)
+  
+  Rather than bucketing the boids in memory, however, we can speed things up by representing the grid in a buffer of indices, each of which represent the index of the first boid to be stored at the cell. From the index of cell A, until I reached the index stored for some cell B, all in between boids belong to cell A. Cells with no boids stored -1 and were ignored.
+  
+![buffers for generating a uniform grid using index sort](images/Boids%20Ugrids%20buffers%20naive.png)
+
+In order to traverse the boids in this way, their representative indices must be sorted by their corresponding cell. For this, I used [thrust](https://github.com/thrust/thrust).
+
+##### Coherent Grid
+
+  Since we're sorting boids anyway, why not speed up boid lookup by sorting their position and velocity vectors. This way, instead of recieving a handful of data indices and retrieving each destination, we can simply receive a handful of boid data directly. 
+  
+![buffers for generating a uniform grid using index sort](images/Boids%20Ugrids%20buffers%20data%20coherent.png)
+  
+  This level of indirection seems like a minor optimization, but had a huge impact on performance. Below is the result of each method for different numbers of boids:
+  
+![](images/BoidNum.png)
+
+  Being my first time using CUDA, the cost of memory lookup was a huge surprise to me. I imagine that by leveraging shared memory to preemptively send nearby information to boid kernels could further improve performance. An important note is that these figures represent something like a stabilized time; all methods performed better at first, but slowed down as birds left their initial uniform distribution.
+  
+  I performed some experiments into the effects of changing the influence radius of the boids, but doing so fundamentally changes the result of the simulation and so can not be reasonably compared.
diff --git a/images/BoidNum.png b/images/BoidNum.png
new file mode 100644
index 0000000..8256262
Binary files /dev/null and b/images/BoidNum.png differ
diff --git a/images/youtube.png b/images/youtube.png
new file mode 100644
index 0000000..3bc6d93
Binary files /dev/null and b/images/youtube.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index aaf0fbf..de9bde0 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -11,6 +11,8 @@
 #define imax( a, b ) ( ((a) > (b)) ? (a) : (b) )
 #endif
 
+//#define print 0
+
 #ifndef imin
 #define imin( a, b ) ( ((a) < (b)) ? (a) : (b) )
 #endif
@@ -41,13 +43,13 @@ void checkCUDAError(const char *msg, int line = -1) {
 
 // LOOK-1.2 Parameters for the boids algorithm.
 // These worked well in our reference implementation.
-#define rule1Distance 5.0f
-#define rule2Distance 3.0f
-#define rule3Distance 5.0f
+#define rule1Distance 1.0f
+#define rule2Distance 0.6f
+#define rule3Distance 1.0f
 
-#define rule1Scale 0.01f
-#define rule2Scale 0.1f
-#define rule3Scale 0.1f
+#define rule1Scale 0.1f
+#define rule2Scale 1.0f
+#define rule3Scale 1.0f
 
 #define maxSpeed 1.0f
 
@@ -85,6 +87,7 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_pingPongPos;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -145,6 +148,9 @@ void Boids::initSimulation(int N) {
   cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
   checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
 
+  cudaMalloc((void**)&dev_pingPongPos, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
+
   cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
   checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
 
@@ -152,8 +158,7 @@ void Boids::initSimulation(int N) {
   checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
 
   // LOOK-1.2 - This is a typical CUDA kernel invocation.
-  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
-    dev_pos, scene_scale);
+  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects, dev_pos, scene_scale);
   checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
   // LOOK-2.1 computing grid params
@@ -168,7 +173,18 @@ void Boids::initSimulation(int N) {
   gridMinimum.y -= halfGridWidth;
   gridMinimum.z -= halfGridWidth;
 
-  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  // DONE-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("failed to allocate indicies for uniform grid");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("failed to allocate grid index buffers");
+
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+ 
   cudaThreadSynchronize();
 }
 
@@ -210,8 +226,8 @@ __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float
 void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) {
   dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
 
-  kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos, vbodptr_positions, scene_scale);
-  kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
+  kernCopyPositionsToVBO <<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, vbodptr_positions, scene_scale);
+  kernCopyVelocitiesToVBO <<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
 
   checkCUDAErrorWithLine("copyBoidsToVBO failed!");
 
@@ -231,20 +247,64 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 */
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+	glm::vec3 perceived_center = glm::vec3(0.0f,0.0f,0.0f);
+	glm::vec3 flee_direction = glm::vec3(0.0f, 0.0f, 0.0f);
+	glm::vec3 perceived_vel = glm::vec3(0.0f, 0.0f, 0.0f);
+
+	glm::vec3 output;
+
+	float distance;
+	int rule1neighbors = 0;
+	int rule3neighbors = 0;
+
+	for (int i = 0; i < N; i++) {
+		if (i != iSelf) {
+			distance = glm::distance(pos[i], pos[iSelf]);
+
+			// Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+			if (distance < rule1Distance) {
+				perceived_center += pos[i];
+				rule1neighbors++;
+			}
+			// Rule 2: boids try to stay a distance d away from each other
+			if (distance < rule2Distance) flee_direction -= (pos[i] - pos[iSelf]);
+			// Rule 3: boids try to match the speed of surrounding boids
+			if (distance < rule3Distance) {
+				perceived_vel += (vel[i]);
+				rule3neighbors++;
+			}
+		}
+	}
+
+	if (rule1neighbors > 0) {
+		perceived_center /= (rule1neighbors);
+		output += (perceived_center - pos[iSelf]) * rule1Scale;
+	}
+
+	if (rule3neighbors > 0) {
+		perceived_vel /= (rule3neighbors);
+		output += perceived_vel * rule3Scale;
+	}
+
+	return output + flee_direction * rule2Scale;
 }
 
 /**
-* TODO-1.2 implement basic flocking
+* DONE-1.2 implement basic flocking
 * For each of the `N` bodies, update its position based on its current velocity.
 */
 __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   glm::vec3 *vel1, glm::vec3 *vel2) {
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= N) return;
   // Compute a new velocity based on pos and vel1
+	glm::vec3 thisVel = vel1[index] + computeVelocityChange(N, index, pos, vel1);
   // Clamp the speed
+	if (glm::length(thisVel) <= maxSpeed) vel2[index] = thisVel;
+	else vel2[index] = glm::normalize(thisVel) * maxSpeed;
   // Record the new velocity into vel2. Question: why NOT vel1?
+	//ANSWER FOR README we reference vel1 for a snapshot of that step; we don't want
+	//to consider velocities for step n+1 during step n
 }
 
 /**
@@ -285,10 +345,19 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
 __global__ void kernComputeIndices(int N, int gridResolution,
   glm::vec3 gridMin, float inverseCellWidth,
   glm::vec3 *pos, int *indices, int *gridIndices) {
-    // TODO-2.1
+    // DONE-2.1
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= N) return;
+
+	//index represents this boid's current memory address in PAI, vel, and pos
+	indices[index] = index;
+
+	//grid index is the floored result of pos/gridWidth, will need to be divided again for remainder (octant check)
+	glm::ivec3 relativePos = (pos[index] - gridMin) * inverseCellWidth;
+	gridIndices[index] = gridIndex3Dto1D(relativePos[0], relativePos[1], relativePos[2], gridResolution);
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -302,10 +371,31 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
 
 __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   int *gridCellStartIndices, int *gridCellEndIndices) {
-  // TODO-2.1
+  // DONE-2.1
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= N) return;
+
+	int thisPGI = particleGridIndices[index];
+
+	if (index == 0) {
+		gridCellStartIndices[thisPGI] = index;
+		return;
+	}
+
+	if (index == N - 1) {
+		gridCellEndIndices[thisPGI] = index;
+	}
+
+	//if bird[index] != bird[index-1], then bird[index-1] is the last of its cell and bird[index] is the first of its cell
+	if (particleGridIndices[index - 1] != thisPGI) {
+		gridCellStartIndices[thisPGI] = index;
+		gridCellEndIndices[particleGridIndices[index - 1]] = index - 1;
+	}
+	
+	//if there are no boids in a cell, it should be untouched and remain -1
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -314,14 +404,91 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   int *gridCellStartIndices, int *gridCellEndIndices,
   int *particleArrayIndices,
   glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+  // DONE-2.1 - Update a boid's velocity using the uniform grid to reduce
   // the number of boids that need to be checked.
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= N) return;
+
   // - Identify the grid cell that this particle is in
+	//If the quotient of position/gridwidth for any dimension is more than halfway
+	//to the next cell, we know it's in the far quadrant
+	glm::vec3 gridCellIndicesRaw = (pos[index] - gridMin) * inverseCellWidth;
+	glm::ivec3 gridCellIndices = gridCellIndicesRaw;
+	glm::vec3 gridCellOverflow = gridCellIndicesRaw - glm::vec3(gridCellIndices);
+
+	//determine if the overflow is > 0.5, but then make sure that the desired neighbor is legal
+	int xNeighbor = gridCellIndices.x + (gridCellOverflow.x > 0.5f ? 1 : -1);
+	if (xNeighbor < 0 || xNeighbor > gridResolution) xNeighbor = 0;
+	int yNeighbor = gridCellIndices.y + gridCellOverflow.y > 0.5f ? 1 : -1;
+	if (yNeighbor < 0 || yNeighbor > gridResolution) yNeighbor = 0;
+	int zNeighbor = gridCellIndices.z + gridCellOverflow.z > 0.5f ? 1 : -1;
+	if (zNeighbor < 0 || zNeighbor > gridResolution) zNeighbor = 0;
+
   // - Identify which cells may contain neighbors. This isn't always 8.
+	int neighbors = 0;
+	int neighborIndices[8];
+
+	//self is always legal
+	neighborIndices[neighbors++] = gridIndex3Dto1D(gridCellIndices.x, gridCellIndices.y, gridCellIndices.z, gridResolution);
+
+	if (xNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(xNeighbor, gridCellIndices.y, gridCellIndices.z, gridResolution);
+	if (yNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(gridCellIndices.x, yNeighbor, gridCellIndices.z, gridResolution);
+	if (zNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(gridCellIndices.x, gridCellIndices.y, zNeighbor, gridResolution);;
+	if (xNeighbor && yNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(xNeighbor, yNeighbor, gridCellIndices.z, gridResolution);
+	if (xNeighbor && zNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(xNeighbor, gridCellIndices.y, zNeighbor, gridResolution);
+	if (yNeighbor && zNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(gridCellIndices.x, yNeighbor, zNeighbor, gridResolution);
+	if (xNeighbor && yNeighbor && zNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(xNeighbor, yNeighbor, zNeighbor, gridResolution);
+
+	glm::vec3 perceived_center = glm::vec3(0.0f, 0.0f, 0.0f);
+	glm::vec3 flee_direction = glm::vec3(0.0f, 0.0f, 0.0f);
+	glm::vec3 perceived_vel = glm::vec3(0.0f, 0.0f, 0.0f);
+
+	glm::vec3 output;
+
+	float distance;
+	int rule1neighbors = 0;
+	int rule3neighbors = 0;
+
   // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+	for (int i = 0; i < neighbors; i++) {
+		int gridCellIndex = neighborIndices[i];
+		for (int j = gridCellStartIndices[gridCellIndex]; j <= gridCellEndIndices[gridCellIndex] && j > -1; j++) {
+			// - Access each boid in the cell and compute velocity change from
+			//   the boids rules, if this boid is within the neighborhood distance.
+			int thisIndex = particleArrayIndices[j];
+			if (thisIndex != index) {
+				distance = glm::distance(pos[thisIndex], pos[index]);
+
+				// Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+				if (distance < rule1Distance) {
+					perceived_center += pos[thisIndex];
+					rule1neighbors++;
+				}
+				// Rule 2: boids try to stay a distance d away from each other
+				if (distance < rule2Distance) flee_direction -= (pos[thisIndex] - pos[index]);
+				// Rule 3: boids try to match the speed of surrounding boids
+				if (distance < rule3Distance) {
+					perceived_vel += (vel1[thisIndex]);
+					rule3neighbors++;
+				}
+			}
+		}
+	}
+
+	if (rule1neighbors > 0) {
+		perceived_center /= (rule1neighbors);
+		output += (perceived_center - pos[index]) * rule1Scale;
+	}
+
+	if (rule3neighbors > 0) {
+		perceived_vel /= (rule3neighbors);
+		output += perceived_vel * rule3Scale;
+	}
+
+	output = vel1[index] + output + flee_direction * rule2Scale;
+	// Clamp the speed and put into vel2
+	if (glm::length(output) <= maxSpeed) vel2[index] = output;
+	else vel2[index] = glm::normalize(output) * maxSpeed;
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -329,18 +496,90 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   float inverseCellWidth, float cellWidth,
   int *gridCellStartIndices, int *gridCellEndIndices,
   glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
-  // except with one less level of indirection.
-  // This should expect gridCellStartIndices and gridCellEndIndices to refer
-  // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+	
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= N) return;
+
+	// - Identify the grid cell that this particle is in
+	//If the quotient of position/gridwidth for any dimension is more than halfway
+	//to the next cell, we know it's in the far quadrant
+	glm::vec3 gridCellIndicesRaw = (pos[index] - gridMin) * inverseCellWidth;
+	glm::ivec3 gridCellIndices = gridCellIndicesRaw;
+	glm::vec3 gridCellOverflow = gridCellIndicesRaw - glm::vec3(gridCellIndices);
+
+	//determine if the overflow is > 0.5, but then make sure that the desired neighbor is legal
+	int xNeighbor = gridCellIndices.x + (gridCellOverflow.x > 0.5f ? 1 : -1);
+	if (xNeighbor < 0 || xNeighbor > gridResolution) xNeighbor = 0;
+	int yNeighbor = gridCellIndices.y + gridCellOverflow.y > 0.5f ? 1 : -1;
+	if (yNeighbor < 0 || yNeighbor > gridResolution) yNeighbor = 0;
+	int zNeighbor = gridCellIndices.z + gridCellOverflow.z > 0.5f ? 1 : -1;
+	if (zNeighbor < 0 || zNeighbor > gridResolution) zNeighbor = 0;
+
+	// - Identify which cells may contain neighbors. This isn't always 8.
+	int neighbors = 0;
+	int neighborIndices[8];
+
+	//self is always legal
+	neighborIndices[neighbors++] = gridIndex3Dto1D(gridCellIndices.x, gridCellIndices.y, gridCellIndices.z, gridResolution);
+
+	if (xNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(xNeighbor, gridCellIndices.y, gridCellIndices.z, gridResolution);
+	if (yNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(gridCellIndices.x, yNeighbor, gridCellIndices.z, gridResolution);
+	if (zNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(gridCellIndices.x, gridCellIndices.y, zNeighbor, gridResolution);;
+	if (xNeighbor && yNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(xNeighbor, yNeighbor, gridCellIndices.z, gridResolution);
+	if (xNeighbor && zNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(xNeighbor, gridCellIndices.y, zNeighbor, gridResolution);
+	if (yNeighbor && zNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(gridCellIndices.x, yNeighbor, zNeighbor, gridResolution);
+	if (xNeighbor && yNeighbor && zNeighbor) neighborIndices[neighbors++] = gridIndex3Dto1D(xNeighbor, yNeighbor, zNeighbor, gridResolution);
+
+	glm::vec3 perceived_center = glm::vec3(0.0f, 0.0f, 0.0f);
+	glm::vec3 flee_direction = glm::vec3(0.0f, 0.0f, 0.0f);
+	glm::vec3 perceived_vel = glm::vec3(0.0f, 0.0f, 0.0f);
+
+	glm::vec3 output;
+
+	float distance;
+	int rule1neighbors = 0;
+	int rule3neighbors = 0;
+
+	// - For each cell, read the start/end indices in the boid pointer array.
+	for (int i = 0; i < neighbors; i++) {
+		int gridCellIndex = neighborIndices[i];
+		for (int j = gridCellStartIndices[gridCellIndex]; j <= gridCellEndIndices[gridCellIndex] && j > -1; j++) {
+			// - Access each boid in the cell and compute velocity change from
+			//   the boids rules, if this boid is within the neighborhood distance.
+			if (j != index) {
+				distance = glm::distance(pos[j], pos[index]);
+
+				// Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+				if (distance < rule1Distance) {
+					perceived_center += pos[j];
+					rule1neighbors++;
+				}
+				// Rule 2: boids try to stay a distance d away from each other
+				if (distance < rule2Distance) flee_direction -= (pos[j] - pos[index]);
+				// Rule 3: boids try to match the speed of surrounding boids
+				if (distance < rule3Distance) {
+					perceived_vel += (vel1[j]);
+					rule3neighbors++;
+				}
+			}
+		}
+	}
+
+	if (rule1neighbors > 0) {
+		perceived_center /= (rule1neighbors);
+		output += (perceived_center - pos[index]) * rule1Scale;
+	}
+
+	if (rule3neighbors > 0) {
+		perceived_vel /= (rule3neighbors);
+		output += perceived_vel * rule3Scale;
+	}
+
+	output = vel1[index] + output + flee_direction * rule2Scale;
+	// Clamp the speed and put into vel2
+	if (glm::length(output) <= maxSpeed) vel2[index] = output;
+	else vel2[index] = glm::normalize(output) * maxSpeed;
+
 }
 
 /**
@@ -348,40 +587,125 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 */
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+	dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+	kernUpdateVelocityBruteForce <<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+	checkCUDAErrorWithLine("Brute force calls failed!");
+
+	kernUpdatePos <<<fullBlocksPerGrid, blockSize>>> (numObjects, dt, dev_pos, dev_vel1);
+	checkCUDAErrorWithLine("Position Update calls failed!");
+
   // TODO-1.2 ping-pong the velocity buffers
+	glm::vec3* temp = dev_vel1;
+	dev_vel1 = dev_vel2;
+	dev_vel2 = temp;
+
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
-  // TODO-2.1
+
+  // DONE-2.1
   // Uniform Grid Neighbor search using Thrust sort.
+
+	dim3 blocksToMakeN((numObjects + blockSize - 1) / blockSize); //number of blocks needed to iterate over numObjects
+	dim3 blocksToMakeCells((gridCellCount + blockSize - 1) / blockSize); //number of blocks to iterate over gridCellCount
+
+	//reset startend arrays, where -1 means that no boids are in that cell
+	kernResetIntBuffer <<<blocksToMakeCells, blockSize>>> (gridCellCount, dev_gridCellStartIndices, -1);
+	kernResetIntBuffer <<<blocksToMakeCells, blockSize>>> (gridCellCount, dev_gridCellEndIndices, -1);
+	checkCUDAErrorWithLine("Failed to Reset Arrays to -1");
+
   // In Parallel:
   // - label each particle with its array index as well as its grid index.
   //   Use 2x width grids.
+	kernComputeIndices <<<blocksToMakeN, blockSize>>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+	checkCUDAErrorWithLine("Failed to Compute Grid Indicies");
+
   // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
   //   are welcome to do a performance comparison.
+	thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+	checkCUDAErrorWithLine("Failed to Sort Grid Indicies");
+
   // - Naively unroll the loop for finding the start and end indices of each
   //   cell's data pointers in the array of boid indices
+	kernIdentifyCellStartEnd<<<blocksToMakeN, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+	checkCUDAErrorWithLine("Failed to Determine Start/End points");
+
   // - Perform velocity updates using neighbor search
+	kernUpdateVelNeighborSearchScattered <<<blocksToMakeN, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+	checkCUDAErrorWithLine("Failed to comput velocity (scattered)");
+
   // - Update positions
+	kernUpdatePos<<<blocksToMakeN,blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+	checkCUDAErrorWithLine("Failed to Update Positions (scattered)");
+
   // - Ping-pong buffers as needed
+	glm::vec3* temp = dev_vel1;
+	dev_vel1 = dev_vel2;
+	dev_vel2 = temp;
+}
+
+//helper for coherent step
+__global__ void kernReorderForCoherent(int N, glm::vec3* pos, glm::vec3* pingPongPos, glm::vec3* vel1, glm::vec3* vel2, int* arrayIndices) {
+	//we'll ping pong both our vel and pos to newly ordered ones. We'll switch them to the expected addresses after this in the Step
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) return;
+
+	pingPongPos[index] = pos[arrayIndices[index]];
+	vel2[index] = vel1[arrayIndices[index]];
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
-  // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
-  // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
-  // In Parallel:
-  // - Label each particle with its array index as well as its grid index.
-  //   Use 2x width grids
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
-  //   the particle data in the simulation array.
-  //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+	dim3 blocksToMakeN((numObjects + blockSize - 1) / blockSize); //number of blocks needed to iterate over numObjects
+	dim3 blocksToMakeCells((gridCellCount + blockSize - 1) / blockSize); //number of blocks to iterate over gridCellCount
+
+																		 //reset startend arrays, where -1 means that no boids are in that cell
+	kernResetIntBuffer << <blocksToMakeCells, blockSize >> > (gridCellCount, dev_gridCellStartIndices, -1);
+	kernResetIntBuffer << <blocksToMakeCells, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1);
+	checkCUDAErrorWithLine("Failed to Reset Arrays to -1");
+
+	// In Parallel:
+	// - label each particle with its array index as well as its grid index.
+	//   Use 2x width grids.
+	kernComputeIndices << <blocksToMakeN, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+	checkCUDAErrorWithLine("Failed to Compute Grid Indicies");
+
+	// - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+	//   are welcome to do a performance comparison.
+	thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+	checkCUDAErrorWithLine("Failed to Sort Grid Indicies");
+
+	// - Naively unroll the loop for finding the start and end indices of each
+	//   cell's data pointers in the array of boid indices
+	kernIdentifyCellStartEnd<<<blocksToMakeN, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+	checkCUDAErrorWithLine("Failed to Determine Start/End points");
+
+	kernReorderForCoherent<<<blocksToMakeN,blockSize>>>(numObjects, dev_pos, dev_pingPongPos, dev_vel1, dev_vel2, dev_particleArrayIndices);
+	checkCUDAErrorWithLine("Failed to reorder boid data");
+
+	//at this point, pos contains incorrectly ordered data. We'll give that to pos
+	glm::vec3* temp = dev_pos;
+	dev_pos = dev_pingPongPos;
+	dev_pingPongPos = temp;
+
+	//likwise, vel2 has the good stuff, but we're used to vel1 having it.
+	temp = dev_vel1;
+	dev_vel1 = dev_vel2;
+	dev_vel2 = temp;
+
+	//this is redundant, we could accommodate these new locations... But for intelligibility, we'll exchange these extra 4 pointers every frame.
+	
+	// - Perform velocity updates using neighbor search
+	kernUpdateVelNeighborSearchScattered << <blocksToMakeN, blockSize >> >(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+	checkCUDAErrorWithLine("Failed to comput velocity (scattered)");
+
+	// - Update positions
+	kernUpdatePos << <blocksToMakeN, blockSize >> >(numObjects, dt, dev_pos, dev_vel2);
+	checkCUDAErrorWithLine("Failed to Update Positions (scattered)");
+
+	// - Ping-pong buffers as needed
+	temp = dev_vel1;
+	dev_vel1 = dev_vel2;
+	dev_vel2 = temp;
 }
 
 void Boids::endSimulation() {
@@ -389,71 +713,10 @@ void Boids::endSimulation() {
   cudaFree(dev_vel2);
   cudaFree(dev_pos);
 
-  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
-}
-
-void Boids::unitTest() {
-  // LOOK-1.2 Feel free to write additional tests here.
-
-  // test unstable sort
-  int *dev_intKeys;
-  int *dev_intValues;
-  int N = 10;
-
-  int *intKeys = new int[N];
-  int *intValues = new int[N];
-
-  intKeys[0] = 0; intValues[0] = 0;
-  intKeys[1] = 1; intValues[1] = 1;
-  intKeys[2] = 0; intValues[2] = 2;
-  intKeys[3] = 3; intValues[3] = 3;
-  intKeys[4] = 0; intValues[4] = 4;
-  intKeys[5] = 2; intValues[5] = 5;
-  intKeys[6] = 2; intValues[6] = 6;
-  intKeys[7] = 0; intValues[7] = 7;
-  intKeys[8] = 5; intValues[8] = 8;
-  intKeys[9] = 6; intValues[9] = 9;
-
-  cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
-
-  cudaMalloc((void**)&dev_intValues, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
-
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  std::cout << "before unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // How to copy data to the GPU
-  cudaMemcpy(dev_intKeys, intKeys, sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(dev_intValues, intValues, sizeof(int) * N, cudaMemcpyHostToDevice);
-
-  // Wrap device vectors in thrust iterators for use with thrust.
-  thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
-  thrust::device_ptr<int> dev_thrust_values(dev_intValues);
-  // LOOK-2.1 Example for using thrust::sort_by_key
-  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
-
-  // How to copy data back to the CPU side from the GPU
-  cudaMemcpy(intKeys, dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  cudaMemcpy(intValues, dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  checkCUDAErrorWithLine("memcpy back failed!");
-
-  std::cout << "after unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // cleanup
-  delete[] intKeys;
-  delete[] intValues;
-  cudaFree(dev_intKeys);
-  cudaFree(dev_intValues);
-  checkCUDAErrorWithLine("cudaFree failed!");
-  return;
+  cudaFree(dev_pingPongPos);
+  
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
 }
diff --git a/src/kernel.h b/src/kernel.h
index 3d3da72..aa857b2 100644
--- a/src/kernel.h
+++ b/src/kernel.h
@@ -17,5 +17,4 @@ namespace Boids {
     void copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities);
 
     void endSimulation();
-    void unitTest();
 }
diff --git a/src/main.cpp b/src/main.cpp
index a29471d..3593eda 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,11 +14,11 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 500000;
 const float DT = 0.2f;
 
 /**
@@ -220,9 +220,6 @@ void initShaders(GLuint * program) {
     double timebase = 0;
     int frame = 0;
 
-    Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
-                       // your CUDA development setup is ready to go.
-
     while (!glfwWindowShouldClose(window)) {
       glfwPollEvents();
 
diff --git a/src/main.hpp b/src/main.hpp
index 6cdaa93..821c23a 100644
--- a/src/main.hpp
+++ b/src/main.hpp
@@ -36,9 +36,9 @@ const float fovy = (float) (PI / 4);
 const float zNear = 0.10f;
 const float zFar = 10.0f;
 // LOOK-1.2: for high DPI displays, you may want to double these settings.
-int width = 1280;
-int height = 720;
-int pointSize = 2.0f;
+int width = 2560;
+int height = 1440;
+int pointSize = 3.0f;
 
 // For camera controls
 bool leftMousePressed = false;