diff --git a/CMakeLists.txt b/CMakeLists.txt
index 88a6775..9a29c61 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,9 +1,27 @@
 cmake_minimum_required(VERSION 3.1...3.25)
 
-project(
-  gs_patterns
-  VERSION 1.0
-  LANGUAGES C)
+project( gs_patterns VERSION 1.0 LANGUAGES CXX)
 
-add_executable(gs_patterns gs_patterns.c)
-set (CMAKE_C_FLAGS "-lz -lm")
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED On)
+#set(CMAKE_CXX_EXTENSIONS Off)
+
+add_library(gs_patterns_core SHARED
+        utils.h
+        utils.cpp
+        gs_patterns.h
+        gs_patterns_core.h
+        gs_patterns_core.cpp
+        gsnv_patterns.h
+        gsnv_patterns.cpp
+        gspin_patterns.h
+        gspin_patterns.cpp
+)
+
+add_executable( gs_patterns
+        gs_patterns_main.cpp
+)
+
+target_link_libraries(gs_patterns gs_patterns_core)
+
+set(CMAKE_CXX_STANDARD_LIBRARIES "-lm -lz ${CMAKE_CXX_STANDARD_LIBRARIES}")
diff --git a/README.md b/README.md
index a91bab0..8bfdbee 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,10 @@
 # Description
-Memory analysis tool for finding nontrivial gather / scatter (g/s) accesses from DynamoRio formatted traces. gs_patterns doesn't just look for explicit g/s instructions, but also all other scalar accesses in loops. gs_patterns writes the subtraces to binary traces and a spatter yaml formatted file. The source lines of the top aggressors are reported. Use the provided pin clients in the pin_tracing folder or use DynamoRio. Pin tends to be more reliable for larger applications.
+Memory analysis tool for finding nontrivial gather / scatter (g/s) accesses from DynamoRio & NVBit traces.  gs_patterns writes the subtraces to binary traces and a spatter yaml formatted file. The source lines of the top aggressors are reported. Use the provided pin clients in the pin_tracing folder or use DynamoRio. Pin tends to be more reliable for larger applications.
+
+For CUDA kernels use the provided nvbit client in the nvbit_tracing folder. 
+
+See the README in the respective folders for more detailed information on these tools.
+
 
 # Build
 ```
@@ -10,10 +15,21 @@ make
 ```
 
 # Use
+
+## For Pin/DynamoRio
+```
+gs_pattern <pin_trace.gz> <binary>
 ```
-gs_pattern <trace.gz> <binary>
+
+## For NVBit (CUDA Kernels) 
+
+```
+gs_pattern <nvbit_trace.gz> -nv 
 ```
-trace file should be gzipped (not tar + gz). Binary file should be compiled with symbols turned on (-g)
+
+Trace file should be gzipped (not tar + gz). For Pin or DynamoRio, binary file should be compiled with symbols turned on (-g).
+
+For NVBit tracing the kernel must be compiled with line numbers (--generate-line-info).  Please see nvbit_tracing/README.md for detailed information on how to extract traces for CUDA kernels which are compatible with gs_patterns.
 
 # How gs_patterns works
 g/s accesses are found by looking at repeated instruction addresses (loops) that are memory instructions (scalar and vector). The first pass finds the top g/s's and filters out instructions with trivial access patterns. The second pass focuses on the top g/s accesses and records the normalized address array indices to a binary file and spatter yaml file.
@@ -21,5 +37,6 @@ g/s accesses are found by looking at repeated instruction addresses (loops) that
 # License
 BSD-3 License. See [the LICENSE file](https://github.com/lanl/gs_patterns/blob/main/LICENSE).
   
-# Author
+# Authors
 Kevin Sheridan, <kss@lanl.gov>
+Christopher Scott, <christopher.scott@gatech.edu>
diff --git a/gs_patterns.c b/gs_patterns.c
deleted file mode 100644
index 37d9225..0000000
--- a/gs_patterns.c
+++ /dev/null
@@ -1,1160 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <string.h>
-#include <assert.h>
-#include <math.h>
-#include <ctype.h>
-#include <zlib.h>
-#include <sys/resource.h>
-
-//symbol lookup options
-#define SYMBOLS_ONLY 1 //Filter out instructions that have no symbol
-
-//Printing
-#define PERSAMPLE 10000000
-
-//info
-#define CLSIZE (64) //cacheline bytes
-#define VBITS (512) //vector bits
-#define NBUFS (1LL<<10) //trace reading buffer size
-#define IWINDOW (1024) //number of iaddrs per window
-#define NGS (8096) //max number for gathers and scatters
-#define OBOUNDS (512) //histogram positive max
-#define OBOUNDS_ALLOC (2*OBOUNDS + 3)
-
-//"patterns"
-#define USTRIDES 1024   //Filter threshold for number of accesses
-#define NSTRIDES 5      //Filter threshold for number of unique distances
-#define OUTTHRESH (0.5) //Filter threshold for percentage of distances at boundaries of histogram
-#define NTOP (10)       //Final gather / scatters to keep
-#define PSIZE (1<<23)   //Max number of indices recorded per gather/scatter
-
-//DONT CHANGE
-#define VBYTES (VBITS/8)
-
-typedef uintptr_t addr_t;
-
-//FROM DR SOURCE
-//DR trace
-struct _trace_entry_t {
-  unsigned short type; // 2 bytes: trace_type_t
-  unsigned short size;
-  union {
-    addr_t addr; 
-    unsigned char length[sizeof(addr_t)];
-  };
-}  __attribute__((packed));
-typedef struct _trace_entry_t trace_entry_t;
-
-static inline int popcount(uint64_t x) {
-    int c;
-    
-    for (c = 0; x != 0; x >>= 1)
-      if (x & 1)
-	c++;
-    return c;
-}
-
-//string tools
-int startswith(const char *a, const char *b) {
-  if(strncmp(b, a, strlen(b)) == 0)
-    return 1;
-  return 0;
-}
-
-int endswith(const char *a, const char *b) {
-  int idx = strlen(a);
-  int preidx = strlen(b);
-
-  if (preidx >= idx)
-    return 0;
-  if(strncmp(b, &a[idx-preidx], preidx) == 0)
-    return 1;
-  return 0;
-}
-
-//https://stackoverflow.com/questions/779875/what-function-is-to-replace-a-substring-from-a-string-in-c
-char *str_replace(char *orig, char *rep, char *with) {
-    char *result; // the return string
-    char *ins;    // the next insert point
-    char *tmp;    // varies
-    int len_rep;  // length of rep (the string to remove)
-    int len_with; // length of with (the string to replace rep with)
-    int len_front; // distance between rep and end of last rep
-    int count;    // number of replacements
-
-    // sanity checks and initialization
-    if (!orig)
-        return NULL;
-    
-    if (!rep)
-      return orig;
-    
-    len_rep = strlen(rep);
-    if (len_rep == 0)
-        return NULL; // empty rep causes infinite loop during count
-    if (!with)
-        with = "";
-    len_with = strlen(with);
-
-    // count the number of replacements needed
-    ins = orig;
-    for (count = 0; tmp = strstr(ins, rep); ++count) {
-        ins = tmp + len_rep;
-    }
-
-    tmp = result = malloc(strlen(orig) + (len_with - len_rep) * count + 1);
-
-    if (!result)
-        return NULL;
-
-    while (count--) {
-        ins = strstr(orig, rep);
-        len_front = ins - orig;
-        tmp = strncpy(tmp, orig, len_front) + len_front;
-        tmp = strcpy(tmp, with) + len_with;
-        orig += len_front + len_rep; // move to next "end of rep"
-    }
-    strcpy(tmp, orig);
-    return result;
-}
-
-char * get_str(char * line, char * bparse, char * aparse) {
-
-  char * sline;
-
-  sline = str_replace(line, bparse, "");
-  sline = str_replace(sline, aparse, "");
-
-  return sline;
-}
-
-int cnt_str(char * line, char c) {
-
-  int cnt = 0;
-  for(int i=0; line[i] != '\0'; i++){
-    if (line[i] == c)
-      cnt++;
-  }
-
-  return cnt;
-}
-  
-void translate_iaddr(char * binary, char * source_line, addr_t iaddr) {
-
-  int i = 0;
-  int ntranslated = 0;
-  char path[1024];
-  char cmd[1024];
-  FILE *fp;
-    
-  sprintf(cmd, "addr2line -e %s 0x%lx", binary, iaddr);
-    
-  /* Open the command for reading. */
-  fp = popen(cmd, "r");
-  if (fp == NULL) {
-    printf("Failed to run command\n" );
-    exit(1);
-  }
-  
-  /* Read the output a line at a time - output it. */
-  while (fgets(path, sizeof(path), fp) != NULL) {
-    strcpy(source_line, path);
-    source_line[strcspn(source_line, "\n")] = 0;
-  }
-  
-  /* close */
-  pclose(fp);
-  
-  return;
-}
-
-int drline_read(gzFile fp, trace_entry_t * val, trace_entry_t ** p_val, int * edx) {
-
-  int idx;
-
-  idx = (*edx)/sizeof(trace_entry_t);
-  //first read
-  if (*p_val == NULL)  {
-    *edx = gzread(fp, val, sizeof(trace_entry_t)*NBUFS);
-    *p_val = val;
-      
-  } else if (*p_val == &val[idx]) {
-    *edx = gzread(fp, val, sizeof(trace_entry_t)*NBUFS);
-    *p_val = val;    
-  }
-  
-  if (*edx == 0) 
-    return 0;
-  
-  return 1;
-}
-
-int main(int argc, char ** argv) {
-
-  //generic
-  int i, j, k, m, n, w;
-  int iwindow = 0;
-  int iret = 0;
-  int ret;
-  int did_opcode = 0;
-  int windowfull = 0;
-  //int byte;
-  int do_gs_traces = 0;
-  int do_filter = 1;
-  int64_t ngs = 0;  
-  char *eptr;
-  char binary[1024];
-  char srcline[1024];
-  
-  //dtrace vars
-  int64_t drtrace_lines = 0;
-  trace_entry_t * drline;
-  trace_entry_t * drline2;
-  trace_entry_t * p_drtrace = NULL; 
-  static trace_entry_t drtrace[NBUFS];
-  gzFile fp_drtrace;
-  FILE * fp_gs;
-
-  //metrics
-  int gs;
-  uint64_t opcodes = 0;
-  uint64_t opcodes_mem = 0;
-  uint64_t addrs = 0;
-  uint64_t other = 0;
-  int64_t maddr_prev;
-  int64_t maddr;
-  int64_t mcl;
-  int64_t giaddrs_nosym = 0;
-  int64_t siaddrs_nosym = 0;
-  int64_t gindices_nosym = 0;
-  int64_t sindices_nosym = 0; 
-  int64_t giaddrs_sym = 0;
-  int64_t siaddrs_sym = 0;
-  int64_t gindices_sym = 0;
-  int64_t sindices_sym = 0;  
-  int64_t gather_bytes_hist[100] = {0};
-  int64_t scatter_bytes_hist[100] = {0};
-  double gather_cnt = 0.0;
-  double scatter_cnt = 0.0;
-  double other_cnt = 0.0;
-  double gather_score = 0.0;
-  double gather_occ_avg = 0.0;
-  double scatter_occ_avg = 0.0;
-
-  //windows
-  int w_rw_idx;
-  int w_idx;
-  addr_t iaddr;
-  static int64_t w_iaddrs[2][IWINDOW];
-  static int64_t w_bytes[2][IWINDOW];
-  static int64_t w_maddr[2][IWINDOW][VBYTES];
-  static int64_t w_cnt[2][IWINDOW];
-
-  //First pass to find top gather / scatters
-  static char gather_srcline[NGS][1024];
-  static addr_t gather_iaddrs[NGS] = {0};
-  static int64_t gather_icnt[NGS] = {0}; //vector instances
-  static int64_t gather_occ[NGS] = {0}; //load instances
-  static char scatter_srcline[NGS][1024]; //src line string
-  static addr_t scatter_iaddrs[NGS] = {0};
-  static int64_t scatter_icnt[NGS] = {0};
-  static int64_t scatter_occ[NGS] = {0};
-
-  //Second Pass
-  int dotrace;
-  int bestcnt;
-  int bestidx;
-  int gather_ntop = 0;
-  int scatter_ntop = 0;
-  static int gather_offset[NTOP] = {0};
-  static int scatter_offset[NTOP] = {0};
-  
-  static addr_t best_iaddr;
-  static addr_t gather_tot[NTOP] = {0};
-  static addr_t scatter_tot[NTOP] = {0};
-  static addr_t gather_top[NTOP] = {0};
-  static addr_t gather_top_idx[NTOP] = {0};
-  static addr_t scatter_top[NTOP] = {0};
-  static addr_t scatter_top_idx[NTOP] = {0};
-  static addr_t gather_base[NTOP] = {0};
-  static addr_t scatter_base[NTOP] = {0};
-  static addr_t gather_size[NTOP] = {0};
-  static addr_t scatter_size[NTOP] = {0};
-  static int64_t * gather_patterns[NTOP] = {0};
-  static int64_t * scatter_patterns[NTOP] = {0};
-
-  for(j=0; j<NTOP; j++) {
-    gather_patterns[j] = (int64_t *) calloc(PSIZE, sizeof(int64_t));
-    if (gather_patterns[j] == NULL) {
-      printf("ERROR: Could not allocate gather_patterns!\n");
-      exit(-1);
-    }
-  }
-  
-  for(j=0; j<NTOP; j++) {
-    scatter_patterns[j] = (int64_t *) calloc(PSIZE, sizeof(int64_t));
-    if (scatter_patterns[j] == NULL) {
-      printf("ERROR: Could not allocate scatter_patterns!\n");
-      exit(-1);
-    }
-  }
-  
-  if (argc == 3) {
-        
-    // 1 open dr trace
-    fp_drtrace = gzopen(argv[1], "hrb");
-    if (fp_drtrace == NULL) {
-      printf("ERROR: Could not open %s!\n", argv[1]);
-      exit(-1);
-    }
-
-    strcpy(binary, argv[2]);
-    
-  } else {
-    printf("ERROR: Invalid arguments, should be: trace.gz binary\n");
-    exit(-1);	
-  }
-
-  //init window arrays
-  for(w=0; w<2; w++) {
-    for (i = 0; i < IWINDOW; i++) {
-      w_iaddrs[w][i] = -1;
-      w_bytes[w][i] = 0;
-      w_cnt[w][i] = 0;
-      for (j=0; j<VBYTES; j++)
-	w_maddr[w][i][j] = -1;
-    }
-  }
-
-  int did_record = 0;
-  uint64_t mcnt = 0;
-  uint64_t unique_iaddrs = 0;
-  int unsynced = 0;
-  uint64_t unsync_cnt = 0;
-  addr_t ciaddr;
-
-  printf("First pass to find top gather / scatter iaddresses\n"); fflush(stdout);
-	 
-  //read dr trace entries instrs
-  //printf("%16s %16s %16s %16s %16s %16s\n", "iaddr", "rw", "byte", "bytes", "cnt", "maddr");
-  while ( drline_read(fp_drtrace, drtrace, &p_drtrace, &iret) ) {
-    
-    //decode drtrace
-    drline = p_drtrace;
-    
-    /*****************************/
-    /** INSTR 0xa-0x10 and 0x1e **/
-    /*****************************/
-    if ( ((drline->type >= 0xa) && (drline->type <= 0x10)) || (drline->type == 0x1e) ) {
-      
-      //iaddr
-      iaddr = drline->addr;
-      
-      //nops
-      opcodes++;
-      did_opcode = 1;
-      
-      /***********************/
-      /** MEM 0x00 and 0x01 **/
-      /***********************/
-    } else if ( (drline->type == 0x0) || (drline->type == 0x1) ) {
-
-      w_rw_idx = drline->type;
-
-      //printf("M DRTRACE -- iaddr: %016lx addr: %016lx cl_start: %d bytes: %d\n",
-      //     iaddr,  drline->addr, drline->addr % 64, drline->size);
-
-      if ((++mcnt % PERSAMPLE) == 0) {
-	printf(".");
-	fflush(stdout);
-      }
-      
-      //is iaddr in window
-      w_idx = -1;
-      for (i=0; i<IWINDOW; i++) {
-	
-	//new iaddr
-	if (w_iaddrs[w_rw_idx][i] == -1) {
-	  w_idx = i;
-	  break;
-	  
-	  //iaddr exists
-	} else if (w_iaddrs[w_rw_idx][i] == iaddr) {
-	  w_idx = i;
-	  break	;  
-	}
-      }
-      
-      //new window
-      if ( (w_idx == -1) || (w_bytes[w_rw_idx][w_idx] >= VBYTES) ||
-	   (w_cnt[w_rw_idx][w_idx] >= VBYTES) ) {
-	
-	/***************************/
-	//do analysis
-	/***************************/
-	//i = each window
-	for(w=0; w<2; w++) {
-	  
-	  for (i=0; i<IWINDOW; i++) {
-	  
-	    if (w_iaddrs[w][i] == -1)
-	      break;
-	  
-	    //byte = w_bytes[w][i] / w_cnt[w][i];
-	  
-	    //First pass
-	    //Determine
-	    //gather/scatter?
-	    gs = -1;
-	    for (j=0; j<w_cnt[w][i]; j++) {
-
-	      //address and cl
-	      maddr = w_maddr[w][i][j];
-	      assert(maddr > -1);
-	      
-	      //previous addr
-	      if (j==0)
-		maddr_prev = maddr - 1;
-	      
-	      //gather / scatter
-	      if ( maddr != maddr_prev) {
-		if ( (gs == -1) && (abs(maddr - maddr_prev) > 1) )
-		  gs = w;
-	      }	      
-	      maddr_prev = maddr;
-	    }
-	    	      
-	    if (gs == -1) {
-
-	      //check if this was a gather
-	      if (w == 0) {
-		
-		for(k=0; k<NGS; k++) {
-		  
-		  //end
-		  if (gather_iaddrs[k] == 0)
-		    break;
-		  
-		  if (gather_iaddrs[k] == w_iaddrs[w][i]) {
-		    gs = 0;
-		    break;
-		  }
-		}
-		  
-		//check if this was a scatter
-	      } else {
-		
-		for(k=0; k<NGS; k++) {
-		  
-		  //end
-		  if (scatter_iaddrs[k] == 0)
-		    break;
-		  
-		  if (scatter_iaddrs[k] == w_iaddrs[w][i]) {
-		    gs = 1;
-		    break;
-		  }
-		}		  
-	      }
-	    }
-
-	    if (gs == -1)
-	      other_cnt++;
-
-	    did_record = 0;
-	    if (gs == 0) {
-	      
-	      gather_occ_avg += w_cnt[w][i];
-	      gather_cnt += 1.0;
-
-	      for(k=0; k<NGS; k++) {
-		if (gather_iaddrs[k] == 0) {
-		  gather_iaddrs[k] = w_iaddrs[w][i];
-		  gather_icnt[k]++;
-		  gather_occ[k] += w_cnt[w][i];
-		  did_record = 1;
-		  break;
-		}
-		
-		if (gather_iaddrs[k] == w_iaddrs[w][i]) {
-		  gather_icnt[k]++;
-		  gather_occ[k] += w_cnt[w][i];
-		  did_record = 1;
-		  break;
-		}
-		
-	      }
-	      assert(did_record == 1);
-	      
-	    } else if (gs == 1) {
-	      
-	      scatter_occ_avg += w_cnt[w][i];
-	      scatter_cnt += 1.0;
-	      
-	      for(k=0; k<NGS; k++) {
-		if (scatter_iaddrs[k] == 0) {
-		  scatter_iaddrs[k] = w_iaddrs[w][i];
-		  scatter_icnt[k]++;
-		  scatter_occ[k] += w_cnt[w][i];
-		  did_record = 1;
-		  break;
-		}
-		
-		if (scatter_iaddrs[k] == w_iaddrs[w][i]) {
-		  scatter_icnt[k]++;
-		  scatter_occ[k] += w_cnt[w][i];
-		  did_record = 1;
-		  break;
-		}		
-	      }
-
-	      assert(did_record == 1);
-	    }
-	  } //WINDOW i
-	
-	  w_idx = 0;
-	  
-	  //reset windows
-	  for (i = 0; i < IWINDOW; i++) {
-	    w_iaddrs[w][i] = -1;
-	    w_bytes[w][i] = 0;
-	    w_cnt[w][i] = 0;
-	    for (j=0; j<VBYTES; j++)
-	      w_maddr[w][i][j] = -1;	
-	  }	  
-	} // rw w
-      } //analysis
-            
-      //Set window values
-      w_iaddrs[w_rw_idx][w_idx] = iaddr;
-      w_maddr[w_rw_idx][w_idx][w_cnt[w_rw_idx][w_idx]] = drline->addr / drline->size;
-      w_bytes[w_rw_idx][w_idx] += drline->size;    
-      
-      //num access per iaddr in loop
-      w_cnt[w_rw_idx][w_idx]++;      
-      
-      if (did_opcode) {	
-	
-	opcodes_mem++;
-	addrs++;
-	did_opcode = 0;
-	
-      } else {
-	addrs++;
-      }
-      
-      /***********************/
-      /** SOMETHING ELSE **/
-      /***********************/
-    } else {
-      other++;
-    }
-    
-    p_drtrace++;
-    drtrace_lines++;
-    
-  } //while drtrace
-  
-  //metrics
-  gather_occ_avg /= gather_cnt;
-  scatter_occ_avg /= scatter_cnt;
-
-  printf("\n RESULTS \n");
-  
-  //close files
-  gzclose(fp_drtrace);
-  
-  printf("DRTRACE STATS\n");
-  printf("DRTRACE LINES:        %16lu\n", drtrace_lines);
-  printf("OPCODES:              %16lu\n", opcodes);
-  printf("MEMOPCODES:           %16lu\n", opcodes_mem);
-  printf("LOAD/STORES:          %16lu\n", addrs);
-  printf("OTHER:                %16lu\n", other);
-  
-  printf("\n");
-  
-  printf("FIRST PASS GATHER/SCATTER STATS: \n");
-  printf("LOADS per GATHER:     %16.3f\n", gather_occ_avg);
-  printf("STORES per SCATTER:   %16.3f\n", scatter_occ_avg);
-  printf("GATHER COUNT:         %16.3f (log2)\n", log(gather_cnt) / log(2.0));
-  printf("SCATTER COUNT:        %16.3f (log2)\n", log(scatter_cnt) / log(2.0));
-  printf("OTHER  COUNT:         %16.3f (log2)\n", log(other_cnt) / log(2.0));
-  
-  //Find source lines 
-
-  //Must have symbol
-  printf("\nSymbol table lookup for gathers..."); fflush(stdout);
-  gather_cnt = 0.0;
-  for(k=0; k<NGS; k++) {
-    
-    if (gather_iaddrs[k] == 0)
-      break;
-    
-    translate_iaddr(binary, gather_srcline[k], gather_iaddrs[k]);
-    
-#if SYMBOLS_ONLY
-    if (startswith(gather_srcline[k], "?")) {
-      gather_icnt[k] = 0;
-      giaddrs_nosym++;
-      gindices_nosym += gather_occ[k];
-      
-    } else {
-      giaddrs_sym++;
-      gindices_sym += gather_occ[k];
-    }
-#endif
-    
-    gather_cnt += gather_icnt[k];
-  }
-  printf("done.\n");
-  //printf("\nTOP GATHERS\n");
-
-  //Get top gathers
-  gather_ntop = 0;
-  for(j=0; j<NTOP; j++) {
-    
-    bestcnt = 0;
-    best_iaddr = 0;
-    bestidx = -1;
-    
-    for(k=0; k<NGS; k++) {
-
-      if (gather_icnt[k] == 0)
-	continue;
-
-      if (gather_iaddrs[k] == 0) {
-	break;
-      }
-      
-      if (gather_icnt[k] > bestcnt) {
-	bestcnt = gather_icnt[k];
-	best_iaddr =  gather_iaddrs[k];
-	bestidx = k;
-      }
-      
-    }
-    
-    if (best_iaddr == 0) {
-      break;
-      
-    } else {
-
-      gather_ntop++;
-      //printf("GIADDR -- %016lx: %16lu -- %s\n",
-      //     gather_iaddrs[bestidx], gather_icnt[bestidx], gather_srcline[bestidx]);
-      
-      gather_top[j] = best_iaddr;
-      gather_top_idx[j] = bestidx;
-      gather_tot[j] = gather_icnt[bestidx];  
-      gather_icnt[bestidx] = 0;
-      
-    }
-  }
-
-  //Find source lines  
-  scatter_cnt = 0.0;
-  
-  printf("Symbol table lookup for scatters..."); fflush(stdout);
-  //Check it is not a library
-  for(k=0; k<NGS; k++) {
-    
-    if (scatter_iaddrs[k] == 0) {
-      break;
-    }
-    translate_iaddr(binary, scatter_srcline[k], scatter_iaddrs[k]);
-    
-#if SYMBOLS_ONLY
-    if (startswith(scatter_srcline[k], "?")) {
-      scatter_icnt[k] = 0;
-      siaddrs_nosym++;
-      sindices_nosym += scatter_occ[k];
-      
-    } else {
-      siaddrs_sym++;
-      sindices_sym += scatter_occ[k];
-    }
-#endif
-    
-    scatter_cnt += scatter_icnt[k];
-  }
-  printf("done.\n");
-  
-  //Get top scatters    
-  //printf("\nTOP SCATTERS\n");
-  scatter_ntop = 0;
-  for(j=0; j<NTOP; j++) {
-    
-    bestcnt = 0;
-    best_iaddr = 0;
-    bestidx = -1;
-    
-    for(k=0; k<NGS; k++) {
-
-      if (scatter_icnt[k] == 0)
-	continue;
-      
-      if (scatter_iaddrs[k] == 0) {
-	break;
-      }
-      
-      if (scatter_icnt[k] > bestcnt) {
-	bestcnt = scatter_icnt[k];
-	best_iaddr =  scatter_iaddrs[k];
-	bestidx = k;
-      }      
-    }
-    
-    if (best_iaddr == 0) {
-      break;
-      
-    } else {
-
-      scatter_ntop++;      
-      scatter_top[j] = best_iaddr;
-      scatter_top_idx[j] = bestidx;
-      scatter_tot[j] = scatter_icnt[bestidx];
-      scatter_icnt[bestidx] = 0;
-      //printf("SIADDR -- %016lx: %16lu -- %s\n",
-      //     scatter_top[j], scatter_tot[j], scatter_srcline[bestidx]);
-    }
-  }
-
-#if SYMBOLS_ONLY  
-  if (giaddrs_nosym || siaddrs_nosym) {
-    printf("\n");
-    printf("IGNORED NONSYMBOL STATS:\n");
-    printf("gather unique iaddrs:  %16ld\n", giaddrs_nosym); 
-    printf("gather total indices:  %16ld (%5.2f%c of 1st pass gathers)\n",
-	   gindices_nosym,
-	   100.0 * (double)gindices_nosym / (double)(gindices_nosym + gindices_sym),'%');  
-    printf("scatter unique iaddrs: %16ld\n", siaddrs_nosym); 
-    printf("scatter total indices: %16ld (%5.2f%c of 1st pass scatters)\n",
-	   sindices_nosym,
-	   100.0 * (double)sindices_nosym / (double)(sindices_nosym + sindices_sym),'%');
-    printf("\n");
-    printf("KEPT SYMBOL STATS:\n");
-    printf("gather unique iaddrs:  %16ld\n", giaddrs_sym); 
-    printf("gather total indices:  %16ld\n", gindices_sym);  
-    printf("scatter unique iaddrs: %16ld\n", siaddrs_sym); 
-    printf("scatter total indices: %16ld\n", sindices_sym);
-  }
-#endif
-  
-  //Second Pass
-  
-  //Open trace
-  fp_drtrace = gzopen(argv[1], "hrb");
-  if (fp_drtrace == NULL) {
-    printf("ERROR: Could not open %s!\n", argv[1]);
-    exit(-1);
-  }    
-  
-  mcnt = 0;
-  iret = 0;
-  p_drtrace = NULL;
-  int breakout = 0;
-  printf("\nSecond pass to fill gather / scatter subtraces\n"); fflush(stdout);
-  while ( drline_read(fp_drtrace, drtrace, &p_drtrace, &iret) && !breakout ) {
-    
-    //decode drtrace
-    drline = p_drtrace;
-    
-    /*****************************/
-    /** INSTR 0xa-0x10 and 0x1e **/
-    /*****************************/
-    if ( ((drline->type >= 0xa) && (drline->type <= 0x10)) || (drline->type == 0x1e) ) {
-      
-      //iaddr
-      iaddr = drline->addr;
-      
-      
-      /***********************/
-      /** MEM 0x00 and 0x01 **/
-      /***********************/
-    } else if ( (drline->type == 0x0) || (drline->type == 0x1) ) {
-      
-      maddr = drline->addr / drline->size;
-      
-      if ((++mcnt % PERSAMPLE) == 0) {
-	printf(".");
-	fflush(stdout);
-      }
-      
-      //gather ?
-      if (drline->type == 0x0) {
-	
-	for(i=0; i<gather_ntop; i++) {
-	  
-	  //found it
-	  if (iaddr == gather_top[i]) {
-	    gather_size[i] = drline->size;
-	    
-	    if (gather_base[i] == 0)
-	      gather_base[i] = maddr;
-	    
-	    //Add index
-	    if (gather_offset[i] >= PSIZE) {
-	      printf("WARNING: Need to increase PSIZE. Truncating trace...\n");
-	      breakout = 1;
-	      break;
-	    }
-	    //printf("g -- %d % d\n", i, gather_offset[i]); fflush(stdout);
-	    gather_patterns[i][ gather_offset[i]++ ] = (int64_t) (maddr - gather_base[i]);	
-	    
-	    break;
-	  }
-	}
-	
-	//scatter ?
-      } else {
-	
-	for(i=0; i<scatter_ntop; i++) {
-	  
-	  //found it
-	  if (iaddr == scatter_top[i]) {
-	    scatter_size[i] = drline->size;
-	    
-	    //set base
-	    if (scatter_base[i] == 0) 
-	      scatter_base[i] =  maddr;
-	    
-	    //Add index
-	    if (scatter_offset[i] >= PSIZE) {
-	      printf("WARNING: Need to increase PSIZE. Truncating trace...\n");
-	      breakout = 1;
-	      break;
-	    }
-	    scatter_patterns[i][ scatter_offset[i]++ ] = (int64_t) (maddr - scatter_base[i]);
-	    break;	      
-	  }
-	}
-      }
-      
-    } //MEM
-    
-    p_drtrace++;
-    
-  } //while drtrace
-  
-  gzclose(fp_drtrace);
-
-  printf("\n");
-  
-  //Normalize
-  int64_t smallest;
-  for(i=0; i<gather_ntop; i++) {
-    
-    //Find smallest
-    smallest = 0x7FFFFFFFFFFFFFFFL;
-    for(j=0; j<gather_offset[i]; j++) {
-      if (gather_patterns[i][j] < smallest)
-	smallest = gather_patterns[i][j];
-    }
-    
-    //Normalize
-    for(j=0; j<gather_offset[i]; j++) {
-      gather_patterns[i][j] -= smallest;    
-    }
-  }
-  
-  for(i=0; i<scatter_ntop; i++) {
-    
-    //Find smallest
-    smallest = 0x7FFFFFFFFFFFFFFFL;
-    for(j=0; j<scatter_offset[i]; j++) {
-      if (scatter_patterns[i][j] < smallest)
-	smallest = scatter_patterns[i][j];
-    }
-    
-    //Normalize
-    for(j=0; j<scatter_offset[i]; j++) {
-      scatter_patterns[i][j] -= smallest;    
-    }
-  }    
-
-  //Create stride histogram and create spatter
-  int sidx;
-  int firstgs = 1;
-  int first_spatter = 1;
-  int ibin = 0;
-  int unique_strides;
-  int64_t idx, pidx, hbin ;
-  int64_t n_stride[OBOUNDS_ALLOC];
-  double outbounds;
-  //print
-
-
-  //Create spatter file
-  FILE * fp, * fp2;
-  char * json_name, * gs_info;
-  json_name = str_replace(argv[1], ".gz", ".json"); 
-  fp = fopen(json_name, "w");
-  if (fp == NULL) {
-    printf("ERROR: Could not open %s!\n", json_name);
-    exit(-1);
-  }
-  gs_info = str_replace(argv[1], ".gz", ".txt");
-  fp2 = fopen(gs_info, "w");
-  if (fp2 == NULL) {
-    printf("ERROR: Could not open %s!\n", gs_info);
-    exit(-1);
-  }
-
-  //Header
-  fprintf(fp, "[ ");
-  fprintf(fp2, "#iaddr, sourceline, type size bytes, g/s, nindices, final percentage of g/s\n");
-  
-  printf("\n");
-  for(i=0; i<gather_ntop; i++) {
-
-    unique_strides = 0;
-    for(j=0; j<OBOUNDS_ALLOC; j++)
-      n_stride[j] = 0;
-    
-    for(j=1; j<gather_offset[i]; j++) {
-      sidx = gather_patterns[i][j] - gather_patterns[i][j-1] + OBOUNDS + 1;
-      sidx = (sidx < 1) ? 0 : sidx;
-      sidx = (sidx > OBOUNDS_ALLOC - 1) ? OBOUNDS_ALLOC - 1 : sidx;
-      n_stride[sidx]++;
-    }
-
-    for(j=0; j<OBOUNDS_ALLOC; j++) {
-      if (n_stride[j] > 0) {
-	unique_strides++;
-      }
-    }
-
-    //percentage out of bounds
-    outbounds = (double) (n_stride[0] + n_stride[OBOUNDS_ALLOC-1]) / (double) gather_offset[i];
-    
-    if (((unique_strides > NSTRIDES) || (outbounds > OUTTHRESH)) && (gather_offset[i] > USTRIDES)) {
-
-      if (firstgs) {
-	firstgs = 0;
-	printf("***************************************************************************************\n");
-	printf("GATHERS\n");
-      }
-      printf("***************************************************************************************\n");
-      //create a binary file
-      FILE * fp_bin;
-      char bin_name[1024];
-      char * tmp_name;
-      
-      tmp_name = str_replace(argv[1], ".gz", "");
-      sprintf(bin_name, "%s.g.%03d.%02dB.sbin", tmp_name, i, gather_size[i]);
-      printf("%s\n", bin_name);
-      fp_bin = fopen(bin_name, "w");
-      if (fp_bin == NULL) {
-	printf("ERROR: Could not open %s!\n", bin_name);
-	exit(-1);
-      }
-      
-      printf("GIADDR   -- %p\n",  gather_top[i]);
-      printf("SRCLINE  -- %s\n", gather_srcline[ gather_top_idx[i] ] );
-      printf("GATHER %c -- %6.3f%c (%4d-bit chunks)\n",
-	     '%', 100.0 * (double) gather_tot[i] / gather_cnt, '%', VBITS);
-      printf("DTYPE      -- %d bytes\n", gather_size[i]);
-      printf("NINDICES   -- %ld\n", gather_offset[i]);
-      printf("INDICES:\n");
-      int64_t nlcnt = 0;
-      for(j=0; j<gather_offset[i]; j++) {
-	
-	if (j <= 49) {
-	  printf("%10ld ", gather_patterns[i][j]); fflush(stdout);
-	  if (( ++nlcnt % 10) == 0)
-	    printf("\n");
-	  
-	} else if (j >= (gather_offset[i] - 50)) {
-	  printf("%10ld ", gather_patterns[i][j]); fflush(stdout);
-	  if (( ++nlcnt % 10) == 0)
-	    printf("\n");
-	  
-	} else if (j == 50)
-	  printf("...\n");
-      }    
-      printf("\n");
-      printf("DIST HISTOGRAM --\n");
-
-      hbin = 0;
-      for(j=0; j<OBOUNDS_ALLOC; j++) {
-	
-	if (j == 0) {
-	  printf("( -inf, %5ld]: %ld\n", (int64_t)(-(VBITS+1)), n_stride[j]);
-	  hbin = 0;
-	  
-	} else if (j == OBOUNDS +1) {	    
-	  printf("[%5ld,     0): %ld\n", (int64_t)-VBITS, hbin);
-	  hbin = 0;
-	  
-	} else if (j == (OBOUNDS_ALLOC-2) ) {
-	  printf("[    0, %5ld]: %ld\n", VBITS, hbin);
-	  hbin = 0;	      
-	  
-	} else if (j == (OBOUNDS_ALLOC-1)) {
-	  printf("[%5ld,   inf): %ld\n", VBITS+1, n_stride[j]);
-	  
-	} else {
-	  hbin += n_stride[j];
-	}
-      }
-      
-      if (first_spatter) {
-	first_spatter = 0;
-	fprintf(fp, " {\"kernel\":\"Gather\", \"pattern\":[");
-		
-      } else {
-	fprintf(fp, ",\n {\"kernel\":\"Gather\", \"pattern\":[");
-      }
-
-      fwrite(gather_patterns[i], sizeof(uint64_t), gather_offset[i], fp_bin);
-      fclose(fp_bin);
-      
-      for(j=0; j<gather_offset[i]-1; j++)	
-	fprintf(fp, "%ld,",  gather_patterns[i][j]);
-      fprintf(fp, "%ld",  gather_patterns[i][gather_offset[i]-1]);
-
-      fprintf(fp, "], \"count\":1}");
-
-      fprintf(fp2, "%p,%s,%d,G,%ld,%6.3f\n",
-	      gather_top[i],gather_srcline[ gather_top_idx[i] ], gather_size[i],
-	      gather_offset[i], 100.0*(double) gather_tot[i] / gather_cnt);
-      free(tmp_name);
-      printf("***************************************************************************************\n\n"); 
-    }   
-  }
-  
-  printf("\n");
-  firstgs = 1;
-  for(i=0; i<scatter_ntop; i++) {
-    
-    unique_strides = 0;
-    for(j=0; j<OBOUNDS_ALLOC; j++)
-      n_stride[j] = 0;
-    
-    for(j=1; j<scatter_offset[i]; j++) {
-      sidx = scatter_patterns[i][j] - scatter_patterns[i][j-1] + OBOUNDS + 1;
-      sidx = (sidx < 1) ? 0 : sidx;
-      sidx = (sidx > OBOUNDS_ALLOC-1) ? OBOUNDS_ALLOC-1 : sidx;
-      n_stride[sidx]++;
-    }
-      
-    for(j=0; j<OBOUNDS_ALLOC; j++) {
-      if (n_stride[j] > 0) {
-	unique_strides++;
-      }
-    }
-    
-    outbounds = (double) (n_stride[0] + n_stride[OBOUNDS_ALLOC-1]) / (double) scatter_offset[i];
-    
-    if (((unique_strides > NSTRIDES) | (outbounds > OUTTHRESH)) && (scatter_offset[i] > USTRIDES)) {
-      
-      if (firstgs) {
-	firstgs = 0;
-	printf("***************************************************************************************\n");
-	printf("SCATTERS\n");
-      }
-      printf("***************************************************************************************\n");
-      //create a binary file
-      FILE * fp_bin;
-      char bin_name[1024];
-      char * tmp_name;
-      tmp_name = str_replace(argv[1], ".gz", "");
-      sprintf(bin_name, "%s.s.%03d.%02dB.sbin", tmp_name, i, scatter_size[i]);
-      printf("%s\n", bin_name);
-      fp_bin = fopen(bin_name, "w");
-      if (fp_bin == NULL) {
-	printf("ERROR: Could not open %s!\n", bin_name);
-	exit(-1);
-      }
-      
-      printf("SIADDR    -- %p\n",   scatter_top[i]);
-      printf("SRCLINE   -- %s\n", scatter_srcline[ scatter_top_idx[i]]);
-      printf("SCATTER %c -- %6.3f%c (%4ld-bit chunks)\n",
-	     '%', 100.0 * (double) scatter_tot[i] / scatter_cnt, '%', VBITS);
-      printf("DTYPE     -- %d bytes\n", scatter_size[i]);
-      printf("NINDICES  -- %ld\n", scatter_offset[i]);
-      printf("INDICES:\n");
-      
-      int64_t nlcnt = 0;
-      for(j=0; j<scatter_offset[i]; j++) {
-	
-	if (j <= 49) {
-	  printf("%10ld ", scatter_patterns[i][j]); fflush(stdout);
-	  if (( ++nlcnt % 10) == 0)
-	    printf("\n");
-	  
-	} else if (j >= (scatter_offset[i] - 50)) {
-	  printf("%10ld ", scatter_patterns[i][j]); fflush(stdout);
-	  if (( ++nlcnt % 10) == 0)
-	    printf("\n");
-	  
-	} else if (j == 50)
-	  printf("...\n");
-      }
-      printf("\n");
-      printf("DIST HISTOGRAM --\n");
-
-      hbin = 0;
-      for(j=0; j<OBOUNDS_ALLOC; j++) {
-	
-	if (j == 0) {
-	  printf("( -inf, %5ld]: %ld\n", (int64_t)(-(VBITS+1)), n_stride[j]);
-	  hbin = 0;
-	  
-	} else if (j == OBOUNDS +1) {	    
-	  printf("[%5ld,     0): %ld\n", (int64_t)-VBITS, hbin);
-	  hbin = 0;
-	  
-	} else if (j == (OBOUNDS_ALLOC-2) ) {
-	  printf("[    0, %5ld]: %ld\n", VBITS, hbin);
-	  hbin = 0;	      
-	  
-	} else if (j == (OBOUNDS_ALLOC-1)) {
-	  printf("[%5ld,   inf): %ld\n", VBITS+1, n_stride[j]);
-	  
-	} else {
-	  hbin += n_stride[j];
-	}
-      }
-      
-      if (first_spatter) {
-	first_spatter = 0;
-	fprintf(fp, " {\"kernel\":\"Scatter\", \"pattern\":[");
-		
-      } else {
-	fprintf(fp, ", {\"kernel\":\"Scatter\", \"pattern\":[");
-      }
-
-      fwrite(scatter_patterns[i], sizeof(uint64_t), scatter_offset[i], fp_bin);
-      fclose(fp_bin);
-      
-      for(j=0; j<scatter_offset[i]-1; j++)	
-	fprintf(fp, "%ld,",  scatter_patterns[i][j]);
-      fprintf(fp, "%ld",  scatter_patterns[i][scatter_offset[i]-1]);
-      fprintf(fp, "], \"count\":1}");
-      
-      fprintf(fp2, "%p,%s,%d,S,%ld,%6.3f\n",
-	      scatter_top[i],scatter_srcline[ scatter_top_idx[i] ], scatter_size[i],
-	      scatter_offset[i], 100.0*(double) scatter_tot[i] / scatter_cnt);
-      free(tmp_name);  
-      printf("***************************************************************************************\n\n");
-    }  
-  }
-
-  //Footer
-  fprintf(fp, " ]");
-  fclose(fp);
-  fclose(fp2);  
-  
-  for(i=0; i<NTOP; i++) {
-    free(gather_patterns[i]);
-    free(scatter_patterns[i]);
-  }
-
-  
-  return 0;
-  
-}
diff --git a/gs_patterns.h b/gs_patterns.h
new file mode 100644
index 0000000..f74b67c
--- /dev/null
+++ b/gs_patterns.h
@@ -0,0 +1,344 @@
+
+#pragma once
+
+#include <exception>
+#include <string>
+#include <cstring>
+#include <vector>
+
+//symbol lookup options
+#if !defined(SYMBOLS_ONLY)
+#define SYMBOLS_ONLY 1 //Filter out instructions that have no symbol
+#endif
+
+//triggers
+#define PERSAMPLE 10000000
+
+//info
+#define CLSIZE (64) //cacheline bytes
+#define NBUFS (1LL<<10) //trace reading buffer size
+#define IWINDOW (1024) //number of iaddrs per window
+#define NGS (8096) //max number for gathers and scatters
+#define OBOUNDS (512) //histogram positive max
+#define OBOUNDS_ALLOC (2*OBOUNDS + 3)
+
+//patterns
+#define USTRIDES 1024   //Threshold for number of accesses
+#define NSTRIDES 15     //Threshold for number of unique distances
+#define OUTTHRESH (0.5) //Threshold for percentage of distances at boundaries of histogram
+#define NTOP (10)       //Final gather / scatters to keep
+#define INITIAL_PSIZE (1<<15)
+#define MAX_PSIZE     (1<<30) //Max number of indices recorded per gather/scatter
+
+#define MAX_LINE_LENGTH 1024
+
+#if !defined(VBITS)
+# define VBITS (512L)
+# define VBYTES (VBITS/8)
+#endif
+
+namespace gs_patterns
+{
+    typedef uintptr_t addr_t;
+    typedef enum { GATHER=0, SCATTER } mem_access_type;
+    typedef enum { VECTOR=0, CTA } mem_instr_type;
+
+    class GSError : public std::exception
+    {
+    public:
+        GSError (const std::string & reason) : _reason(reason) { }
+        ~GSError() {}
+
+        const char * what() const noexcept override { return _reason.c_str(); }
+    private:
+        std::string _reason;
+    };
+
+    class GSFileError : public GSError
+    {
+    public:
+        GSFileError (const std::string & reason) : GSError(reason) { }
+        ~GSFileError() {}
+    };
+
+    class GSDataError : public GSError
+    {
+    public:
+        GSDataError (const std::string & reason) : GSError(reason) { }
+        ~GSDataError() {}
+    };
+
+    class GSAllocError : public GSError
+    {
+    public:
+        GSAllocError (const std::string & reason) : GSError(reason) { }
+        ~GSAllocError() {}
+    };
+
+    class InstrAddrAdapter
+    {
+    public:
+        InstrAddrAdapter() { }
+        virtual ~InstrAddrAdapter() { }
+
+        virtual bool            is_valid() const            = 0;
+        virtual bool            is_mem_instr() const        = 0;
+        virtual bool            is_other_instr() const      = 0;
+        virtual mem_access_type get_mem_access_type() const = 0;
+        virtual mem_instr_type  get_mem_instr_type() const  = 0;
+
+        virtual size_t         get_size() const             = 0;
+        virtual addr_t         get_base_addr() const        = 0;
+        virtual addr_t         get_address() const          = 0;
+        virtual addr_t         get_iaddr() const            = 0;
+        virtual addr_t         get_maddr() const            = 0;
+        virtual unsigned short get_type() const             = 0; // must be 0 for GATHER, 1 for SCATTER !!
+        virtual int64_t        get_max_access_size() const  = 0;
+
+        virtual bool is_gather() const
+        { return (is_valid() && is_mem_instr() && GATHER == get_mem_access_type()) ? true : false; }
+
+        virtual bool is_scatter() const
+        { return (is_valid() && is_mem_instr() && SCATTER == get_mem_access_type()) ? true : false; }
+
+        virtual void output(std::ostream & os) const      = 0;
+    };
+
+    std::ostream & operator<<(std::ostream & os, const InstrAddrAdapter & ia);
+
+
+    class Metrics
+    {
+    public:
+        Metrics(mem_access_type mType) : _mType(mType), _pattern_sizes(NTOP)
+        {
+            try
+            {
+                for (int j = 0; j < NTOP; j++) {
+                    patterns[j] = new int64_t[INITIAL_PSIZE];
+                    _pattern_sizes[j] = INITIAL_PSIZE;
+                }
+            }
+            catch (const std::exception & ex)
+            {
+                throw GSAllocError("Could not allocate patterns for " + type_as_string() + "! due to: " + ex.what());
+            }
+        }
+
+        ~Metrics()
+        {
+            for (int i = 0; i < NTOP; i++) {
+                delete [] patterns[i];
+            }
+
+            delete [] srcline;
+        }
+
+        size_t get_pattern_size(int pattern_index) {
+            return _pattern_sizes[pattern_index];
+        }
+
+        bool grow(int pattern_index) {
+            try {
+                size_t old_size = _pattern_sizes[pattern_index];
+                size_t new_size = old_size * 2;
+                if (new_size > MAX_PSIZE) {
+                    return false;
+                }
+
+                int64_t *tmp = new int64_t[new_size];
+                memcpy(tmp, patterns[pattern_index], old_size * sizeof(int64_t));
+
+                delete [] patterns[pattern_index];
+                patterns[pattern_index] = tmp;
+                _pattern_sizes[pattern_index] = new_size;
+
+                return true;
+            }
+            catch (...) {
+                return false;
+            }
+        }
+
+        Metrics(const Metrics &) = delete;
+        Metrics & operator=(const Metrics & right) = delete;
+
+        std::string type_as_string() { return !_mType ? "GATHER" : "SCATTER"; }
+        std::string getName()        { return !_mType ? "Gather" : "Scatter"; }
+        std::string getShortName()   { return !_mType ? "G" : "S"; }
+        std::string getShortNameLower()   { return !_mType ? "g" : "s"; }
+
+        auto get_srcline() { return srcline[_mType]; }
+
+        int      ntop = 0;
+        int64_t  iaddrs_nosym = 0;
+        int64_t  indices_nosym = 0;
+        int64_t  iaddrs_sym = 0;
+        int64_t  indices_sym = 0;
+        double   cnt = 0.0;
+        int      offset[NTOP]  = {0};
+        int      size[NTOP]  = {0};
+
+        addr_t   tot[NTOP]     = {0};
+        addr_t   top[NTOP]     = {0};
+        addr_t   top_idx[NTOP] = {0};
+
+        int64_t* patterns[NTOP] = {0};
+
+    private:
+        char (*srcline)[NGS][MAX_LINE_LENGTH] = new char[2][NGS][MAX_LINE_LENGTH];
+
+        mem_access_type _mType;
+
+        std::vector<size_t>  _pattern_sizes;
+    };
+
+
+    class InstrInfo
+    {
+    public:
+        InstrInfo(mem_access_type mType) : _mType(mType) { }
+        ~InstrInfo() {
+            delete [] iaddrs;
+            delete [] icnt;
+            delete [] occ;
+        }
+
+        InstrInfo(const InstrInfo &) = delete;
+        InstrInfo & operator=(const InstrInfo & right) = delete;
+
+        addr_t*  get_iaddrs() { return iaddrs[_mType]; }
+        int64_t* get_icnt()   { return icnt[_mType]; }
+        int64_t* get_occ()    { return occ[_mType]; }
+
+    private:
+        addr_t (*iaddrs)[NGS] = new addr_t[2][NGS];
+        int64_t (*icnt)[NGS]  = new int64_t[2][NGS]; //vector instances
+        int64_t (*occ)[NGS]   = new int64_t[2][NGS];  //load/store instances
+
+        mem_access_type _mType;
+    };
+
+    class TraceInfo  // Stats
+    {
+    public:
+        /// TODO: need a reset method to zero out counters
+
+        uint64_t opcodes     = 0;
+        uint64_t opcodes_mem = 0;
+        uint64_t addrs       = 0;
+        uint64_t other       = 0;
+        int64_t  ngs         = 0;
+        int64_t trace_lines  = 0;
+
+        bool    did_opcode      = false; // revist this ---------------
+        double  other_cnt       = 0.0;
+        double  gather_score    = 0.0;
+        double  gather_occ_avg  = 0.0;
+        double  scatter_occ_avg = 0.0;
+
+        uint64_t     mcnt  = 0;
+    };
+
+    template <std::size_t MAX_ACCESS_SIZE>
+    class InstrWindow
+    {
+    public:
+        InstrWindow() {
+            // First dimension is 0=GATHER/1=SCATTER
+            _w_iaddrs = new int64_t[2][IWINDOW];
+            _w_bytes  = new int64_t[2][IWINDOW];
+            _w_maddr  = new int64_t[2][IWINDOW][MAX_ACCESS_SIZE];
+            _w_cnt    = new int64_t[2][IWINDOW];
+
+            init();
+        }
+
+        virtual ~InstrWindow() {
+            delete [] _w_iaddrs;
+            delete [] _w_bytes;
+            delete [] _w_maddr;
+            delete [] _w_cnt;
+        }
+
+        void init() {
+            for (int w = 0; w < 2; w++) {
+                for (int i = 0; i < IWINDOW; i++) {
+                    _w_iaddrs[w][i] = -1;
+                    _w_bytes[w][i] = 0;
+                    _w_cnt[w][i] = 0;
+                    for (uint64_t j = 0; j < MAX_ACCESS_SIZE; j++)
+                        _w_maddr[w][i][j] = -1;
+                }
+            }
+        }
+
+        void reset(int w) {
+            for (int i = 0; i < IWINDOW; i++) {
+                _w_iaddrs[w][i] = -1;
+                _w_bytes[w][i] = 0;
+                _w_cnt[w][i] = 0;
+                for (uint64_t j = 0; j < MAX_ACCESS_SIZE; j++)
+                    _w_maddr[w][i][j] = -1;
+            }
+        }
+
+        void reset() {
+            for (int w = 0; w < 2; w++) {
+                reset(w);
+            }
+        }
+
+        InstrWindow(const InstrWindow &) = delete;
+        InstrWindow & operator=(const InstrWindow & right) = delete;
+
+        int64_t & w_iaddrs(int32_t i, int32_t j)             { return _w_iaddrs[i][j];   }
+        int64_t & w_bytes(int32_t i, int32_t j)              { return _w_bytes[i][j];    }
+        int64_t & w_maddr(int32_t i, int32_t j, int32_t k)   { return _w_maddr[i][j][k]; }
+        int64_t & w_cnt(int32_t i, int32_t j)                { return _w_cnt[i][j];      }
+
+        addr_t &  get_iaddr()       { return iaddr;      }
+        int64_t & get_maddr_prev()  { return maddr_prev; }
+        int64_t & get_maddr()       { return maddr;      }
+
+    private:
+        // First dimension is 0=GATHER/1=SCATTER
+        int64_t (*_w_iaddrs)[IWINDOW];
+        int64_t (*_w_bytes)[IWINDOW];
+        int64_t (*_w_maddr)[IWINDOW][MAX_ACCESS_SIZE];
+        int64_t (*_w_cnt)[IWINDOW];
+
+        // State which must be carried with each call to handle a trace
+        addr_t   iaddr;
+        int64_t  maddr_prev;
+        int64_t  maddr;
+    };
+
+    template <std::size_t MAX_ACCESS_SIZE>
+    class MemPatterns
+    {
+    public:
+        MemPatterns() { }
+        virtual ~MemPatterns() { };
+
+        MemPatterns(const MemPatterns &) = delete;
+        MemPatterns & operator=(const MemPatterns &) = delete;
+
+        virtual void handle_trace_entry(const InstrAddrAdapter & ia) = 0;
+        virtual void generate_patterns() = 0;
+
+        virtual Metrics &     get_metrics(mem_access_type) = 0;
+        virtual InstrInfo &   get_iinfo(mem_access_type)   = 0;
+
+        virtual Metrics &     get_gather_metrics()      = 0;
+        virtual Metrics &     get_scatter_metrics()     = 0;
+        virtual InstrInfo &   get_gather_iinfo()        = 0;
+        virtual InstrInfo &   get_scatter_iinfo()       = 0;
+        virtual TraceInfo &   get_trace_info()          = 0;
+        virtual InstrWindow<MAX_ACCESS_SIZE> &
+                              get_instr_window()        = 0;
+        virtual void          set_log_level(int8_t ll)  = 0;
+        virtual int8_t        get_log_level()           = 0;
+    };
+
+} // namespace gs_patterns
diff --git a/gs_patterns_core.cpp b/gs_patterns_core.cpp
new file mode 100644
index 0000000..b9b0354
--- /dev/null
+++ b/gs_patterns_core.cpp
@@ -0,0 +1,364 @@
+
+#include <assert.h> /// TODO: use cassert instead
+#include <math.h>
+
+#include <string>
+#include <sstream>
+
+#include "utils.h"
+#include "gs_patterns.h"
+
+namespace gs_patterns
+{
+namespace gs_patterns_core
+{
+    using namespace gs_patterns;
+
+    void translate_iaddr(const std::string & binary, char * source_line, addr_t iaddr)
+    {
+        char path[MAX_LINE_LENGTH];
+        char cmd[MAX_LINE_LENGTH];
+        FILE *fp;
+
+        sprintf(cmd, "addr2line -e %s 0x%lx", binary.c_str(), iaddr);
+
+        /* Open the command for reading. */
+        fp = popen(cmd, "r");
+        if (NULL == fp) {
+            throw GSError("Failed to run command");
+        }
+
+        /* Read the output a line at a time - output it. */
+        while (fgets(path, sizeof(path), fp) != NULL) {
+            strcpy(source_line, path);
+            source_line[strcspn(source_line, "\n")] = 0;
+        }
+
+        /* close */
+        pclose(fp);
+
+        return;
+    }
+
+    void create_metrics_file(FILE * fp, FILE * fp2, const std::string & file_prefix, Metrics & target_metrics, bool & first_spatter)
+    {
+        int i = 0;
+        int j = 0;
+
+        //Create stride histogram and create spatter
+        int sidx;
+	int firstgs = 1;
+        int unique_strides;
+	int64_t hbin = 0;
+        int64_t n_stride[OBOUNDS_ALLOC];
+        double outbounds;
+
+        if (file_prefix.empty()) throw GSFileError ("Empty file prefix provided.");
+
+        if (first_spatter) printf("\n");
+
+        printf("\n");
+        for (i = 0; i < target_metrics.ntop; i++) {
+            printf("***************************************************************************************\n");
+
+            unique_strides = 0;
+            for (j = 0; j < OBOUNDS_ALLOC; j++)
+                n_stride[j] = 0;
+
+            for (j = 1; j < target_metrics.offset[i]; j++) {
+                sidx = target_metrics.patterns[i][j] - target_metrics.patterns[i][j - 1] + OBOUNDS + 1;
+                sidx = (sidx < 1) ? 0 : sidx;
+                sidx = (sidx > OBOUNDS_ALLOC - 1) ? OBOUNDS_ALLOC - 1 : sidx;
+                n_stride[sidx]++;
+            }
+
+            for (j = 0; j < OBOUNDS_ALLOC; j++) {
+                if (n_stride[j] > 0) {
+                    unique_strides++;
+                }
+            }
+
+            outbounds = (double) (n_stride[0] + n_stride[OBOUNDS_ALLOC-1]) / (double) target_metrics.offset[i];
+
+            if (((unique_strides > NSTRIDES) || (outbounds > OUTTHRESH)  && (target_metrics.offset[i] > USTRIDES ) )) {
+		//if (true) {
+
+	        if (firstgs) {
+	  	  firstgs = 0;
+		  printf("***************************************************************************************\n");
+		  printf("%sS\n", target_metrics.type_as_string().c_str());
+	        }
+	        printf("***************************************************************************************\n");
+                //create a binary file
+                FILE * fp_bin;
+		
+		char bin_name[1024];
+		sprintf(bin_name, "%s.%s.%03d.%02dB.sbin", file_prefix.c_str(), target_metrics.getShortNameLower().c_str(), \
+			i, target_metrics.size[i]);
+		printf("%s\n", bin_name);
+                //std::string bin_name =				\
+		//  file_prefix + "." + target_metrics.getShortNameLower().c_str() + "." + std::to_string(i) + "." + \
+		//  std::to_string(target_metrics.size[i]) + "B.sbin";
+				
+                fp_bin = fopen(bin_name, "w");
+                if (NULL == fp_bin) 
+                    throw GSFileError("Could not open " + std::string(bin_name) + "!");             
+
+                printf("%sIADDR    -- %p\n", target_metrics.getShortName().c_str(), (void*) target_metrics.top[i]);
+                printf("SRCLINE   -- %s\n", target_metrics.get_srcline()[target_metrics.top_idx[i]]);
+                printf("GATHER %c -- %6.3f%c (%4ld-bit chunks)\n",
+	               '%', 100.0 * (double) target_metrics.tot[i] / target_metrics.cnt, '%', VBITS);
+                printf("DTYPE      -- %d bytes\n", target_metrics.size[i]);
+                printf("NINDICES   -- %d\n", target_metrics.offset[i]);
+                printf("INDICES:\n");
+      
+                int64_t nlcnt = 0;
+                for (j = 0; j < target_metrics.offset[i]; j++) {
+	
+		  if (j <= 49) {
+		    printf("%10ld ", target_metrics.patterns[i][j]);
+		    fflush(stdout);
+		    if (( ++nlcnt % 10) == 0)
+		      printf("\n");
+	  
+		  } else if (j >= (target_metrics.offset[i] - 50)) {
+		    printf("%10ld ", target_metrics.patterns[i][j]);
+		    fflush(stdout);
+		    if (( ++nlcnt % 10) == 0)
+		      printf("\n");
+		    
+		  } else if (j == 50)
+		    printf("...\n");
+                }
+                printf("\n");
+                printf("DIST HISTOGRAM --\n");
+
+	        hbin = 0;
+	        for(j=0; j<OBOUNDS_ALLOC; j++) {
+	
+		 if (j == 0) {
+		   printf("( -inf, %5ld]: %ld\n", (int64_t)(-(VBITS+1)), n_stride[j]);
+		   hbin = 0;
+	  
+		 } else if (j == OBOUNDS +1) {	    
+		   printf("[%5ld,     0): %ld\n", (int64_t)-VBITS, hbin);
+		   hbin = 0;
+	  
+		 } else if (j == (OBOUNDS_ALLOC-2) ) {
+		   printf("[    0, %5ld]: %ld\n", VBITS, hbin);
+		   hbin = 0;	      
+	  
+		 } else if (j == (OBOUNDS_ALLOC-1)) {
+		   printf("[%5ld,   inf): %ld\n", VBITS+1, n_stride[j]);
+	  
+		 } else {
+		   hbin += n_stride[j];
+		 }
+	       }
+
+                if (first_spatter) {
+                    first_spatter = false;
+                    fprintf(fp, " {\"kernel\":\"%s\", \"pattern\":[", target_metrics.getName().c_str());
+                } else {
+                    fprintf(fp, ",\n {\"kernel\":\"%s\", \"pattern\":[", target_metrics.getName().c_str());
+                }
+
+                fwrite(target_metrics.patterns[i], sizeof(uint64_t), target_metrics.offset[i], fp_bin);
+                fclose(fp_bin);
+
+                for (j = 0; j < target_metrics.offset[i] - 1; j++)
+                    fprintf(fp, "%ld,", target_metrics.patterns[i][j]);
+                fprintf(fp, "%ld", target_metrics.patterns[i][target_metrics.offset[i] - 1]);
+                fprintf(fp, "], \"count\":1}");
+
+		fprintf(fp2, "0x%lx,%s,%d,%s,%d,%6.3f\n",
+			target_metrics.top[i],
+			target_metrics.get_srcline()[target_metrics.top_idx[i]],
+			target_metrics.size[i],
+			target_metrics.getShortName().c_str(),
+			target_metrics.offset[i],
+                        100.0 * (double) target_metrics.tot[i] / target_metrics.cnt);		
+            }
+            printf("***************************************************************************************\n\n");
+        }
+    }
+
+    void normalize_stats(Metrics & target_metrics)
+    {
+        //Normalize
+        int64_t smallest;
+        for (int i = 0; i < target_metrics.ntop; i++) {
+
+            //Find smallest
+	    smallest = 0x7FFFFFFFFFFFFFFFL;
+	    for (int j = 0; j < target_metrics.offset[i]; j++) {
+                if (target_metrics.patterns[i][j] < smallest)
+                    smallest = target_metrics.patterns[i][j];
+            }
+
+            //Normalize
+            for (int j = 0; j < target_metrics.offset[i]; j++) {
+                target_metrics.patterns[i][j] -= smallest;
+            }
+        }
+    }
+
+    int get_top_target(InstrInfo & target_iinfo, Metrics & target_metrics)
+    {
+        int target_ntop = 0;
+
+        for (int j = 0; j < NTOP; j++)
+        {
+            int bestcnt = 0;
+            addr_t best_iaddr = 0;
+            int bestidx = -1;
+
+            for (int k = 0; k < NGS; k++)
+            {
+                if (target_iinfo.get_icnt()[k] == 0)
+                    continue;
+
+                if (target_iinfo.get_iaddrs()[k] == 0) {
+                    break;
+                }
+
+                if (target_iinfo.get_icnt()[k] > bestcnt) {
+                    bestcnt = target_iinfo.get_icnt()[k];
+                    best_iaddr = target_iinfo.get_iaddrs()[k];
+                    bestidx = k;
+                }
+            }
+
+            if (best_iaddr == 0)
+            {
+                break;
+            }
+            else
+            {
+                target_ntop++;
+                target_metrics.top[j] = best_iaddr;
+                target_metrics.top_idx[j] = bestidx;
+                target_metrics.tot[j] = target_iinfo.get_icnt()[bestidx];
+                target_iinfo.get_icnt()[bestidx] = 0;
+
+                //printf("%sIADDR -- %016lx: %16lu -- %s\n", target_metrics.getShortName().c_str(), target_metrics.top[j], target_metrics.tot[j], target_metrics.get_srcline()[bestidx]);
+            }
+        } // for
+
+        return target_ntop;
+    }
+
+    bool handle_2nd_pass_trace_entry(const InstrAddrAdapter & ia,
+                                     Metrics & gather_metrics, Metrics & scatter_metrics,
+                                     addr_t & iaddr, int64_t & maddr, uint64_t & mcnt,
+                                     addr_t * gather_base, addr_t * scatter_base)
+    {
+        int iret = 0;
+        int i = 0;
+
+        bool breakout = false;
+
+        /*****************************/
+        /** INSTR 0xa-0x10 and 0x1e **/
+        /*****************************/
+        if (!ia.is_valid()) {
+            std::ostringstream os;
+            os << "Invalid " << ia;
+            throw GSDataError(os.str());
+        }
+
+        if (ia.is_other_instr())
+        {
+            iaddr = ia.get_iaddr();  // was get_address in orig code  -> get_iaddr()
+        }
+        else if (ia.is_mem_instr())
+        {
+            /***********************/
+            /** MEM               **/
+            /***********************/
+
+            maddr = ia.get_maddr();
+
+            if (CTA == ia.get_mem_instr_type() && ia.get_address() == ia.get_base_addr()) {
+                iaddr = ia.get_iaddr();
+            }
+
+            if ((++mcnt % PERSAMPLE) == 0) {
+                printf(".");
+                fflush(stdout);
+            }
+
+            // gather ?
+            if (GATHER == ia.get_mem_access_type())
+            {
+                for (i = 0; i < gather_metrics.ntop; i++)
+                {
+                    //found it
+                    if (iaddr == gather_metrics.top[i])
+                    {
+		      
+		        gather_metrics.size[i] = ia.get_size();
+			
+                        if (gather_base[i] == 0)
+                            gather_base[i] = maddr;
+
+                        //Add index
+                        if (gather_metrics.offset[i] >= gather_metrics.get_pattern_size(i)) {
+                            if (!gather_metrics.grow(i)) {
+                                printf("WARNING: Unable to increase PSIZE. Truncating trace...\n");
+                                breakout = true;
+				break;
+                            }
+                        }
+                        gather_metrics.patterns[i][gather_metrics.offset[i]++] = (int64_t) (maddr - gather_base[i]);
+                        break;
+                    }
+                }
+            }
+            // scatter ?
+            else if (SCATTER == ia.get_mem_access_type())
+            {
+                for (i = 0; i < scatter_metrics.ntop; i++)
+                {
+                    //found it
+                    if (iaddr == scatter_metrics.top[i])
+                    {
+		        scatter_metrics.size[i] = ia.get_size();
+			
+                        //set base
+                        if (scatter_base[i] == 0)
+                            scatter_base[i] = maddr;
+
+                        //Add index
+                        if (scatter_metrics.offset[i] >= scatter_metrics.get_pattern_size(i)) {
+                            if (!scatter_metrics.grow(i)) {
+                                printf("WARNING: Unable to increase PSIZE. Truncating trace...\n");
+                                breakout = true;
+				break;
+                            }
+                        }
+                        scatter_metrics.patterns[i][scatter_metrics.offset[i]++] = (int64_t) (maddr - scatter_base[i]);
+                        break;
+                    }
+                }
+            }
+            else
+            { // belt and suspenders, yep = but helps to validate correct logic in children of InstrAddresInfo
+                throw GSDataError("Unknown Memory Access Type: " + std::to_string(ia.get_mem_access_type()));
+            }
+        } // MEM
+
+        return breakout;
+    }
+
+} // namespace gs_patterns_core
+
+std::ostream & operator<<(std::ostream & os, const gs_patterns::InstrAddrAdapter & ia)
+{
+    ia.output(os);
+    return os;
+}
+
+} // namespace gs_patterns
+
+
diff --git a/gs_patterns_core.h b/gs_patterns_core.h
new file mode 100644
index 0000000..8af3219
--- /dev/null
+++ b/gs_patterns_core.h
@@ -0,0 +1,316 @@
+
+#pragma once
+
+#include <stdio.h>
+#include <assert.h> /// TODO: use cassert instead
+#include <math.h>
+#include <string>
+
+#include "gs_patterns.h"
+
+namespace gs_patterns
+{
+namespace gs_patterns_core
+{
+    void translate_iaddr(const std::string & binary, char * source_line, addr_t iaddr);
+
+    template <typename std::size_t T>
+    void handle_trace_entry(MemPatterns<T> & mp, const InstrAddrAdapter & ia)
+    {
+        int i, j, k, w = 0;
+        int w_rw_idx;   // Index into instruction window first dimension (RW: 0=Gather(R) or 1=Scatter(W))
+        int w_idx;
+        int gs;
+
+        auto & trace_info = mp.get_trace_info();
+        auto & gather_iinfo = mp.get_gather_iinfo();
+        auto & scatter_iinfo = mp.get_scatter_iinfo();
+        auto & gather_metrics = mp.get_gather_metrics();
+        auto & scatter_metrics = mp.get_scatter_metrics();
+        auto & iw = mp.get_instr_window();
+
+        if (!ia.is_valid()) {
+            std::ostringstream os;
+            os << "Invalid " << ia;
+            throw GSDataError(os.str());
+        }
+
+        if (ia.is_other_instr())
+        {
+            /*****************************/
+            /** INSTR                   **/
+            /*****************************/
+
+            iw.get_iaddr() = ia.get_iaddr(); // was get_address in orig code -> get_iaddr()
+
+            //nops
+            trace_info.opcodes++;
+            trace_info.did_opcode = true;
+        }
+        else if (ia.is_mem_instr())
+        {
+            /***********************/
+            /** MEM instruction   **/
+            /***********************/
+
+            if (CTA == ia.get_mem_instr_type() && ia.get_base_addr() == ia.get_address()) {
+                iw.get_iaddr() = ia.get_iaddr();
+                trace_info.opcodes++;
+                trace_info.did_opcode = true;
+            }
+            w_rw_idx = ia.get_type();
+
+            //printf("M DRTRACE -- iaddr: %016lx addr: %016lx cl_start: %d bytes: %d\n",
+            //     iw.iaddr,  ia.get_address(), ia.get_address() % 64, ia.get_size());
+
+            if ((++trace_info.mcnt % PERSAMPLE) == 0) {
+                printf(".");
+                fflush(stdout);
+            }
+
+            //is iaddr in window
+            w_idx = -1;
+            for (i = 0; i < IWINDOW; i++) {
+
+                //new iaddr
+                if (iw.w_iaddrs(w_rw_idx, i) == -1) {
+                    w_idx = i;
+                    break;
+
+                    //iaddr exists
+                } else if (iw.w_iaddrs(w_rw_idx, i) == iw.get_iaddr()) {
+                    w_idx = i;
+                    break;
+                }
+            }
+
+            //new window
+            if ((w_idx == -1) || (iw.w_bytes(w_rw_idx, w_idx) >= ia.get_max_access_size()) ||   // was >= VBYTES
+                (iw.w_cnt(w_rw_idx, w_idx) >= ia.get_max_access_size())) {                      // was >= VBYTES
+
+                /***************************/
+                // do analysis
+                /***************************/
+                // i = each window
+                for (w = 0; w < 2; w++) {  // 2
+
+                    for (i = 0; i < IWINDOW; i++) {  // 1024
+
+                        if (iw.w_iaddrs(w,i) == -1)
+                            break;
+
+                        //int byte = iw.w_bytes(w, i) / iw.w_cnt(w, i);
+
+                        // First pass - Determine gather/scatter?
+                        gs = -1;
+                        for (j = 0; j < iw.w_cnt(w, i); j++) {
+
+                            // address and cl
+                            iw.get_maddr() = iw.w_maddr(w, i, j);
+                            assert(iw.get_maddr() > -1);
+
+                            // previous addr
+                            if (j == 0)
+                                iw.get_maddr_prev() = iw.get_maddr() - 1;
+
+                            // gather / scatter potential
+                            if (iw.get_maddr() != iw.get_maddr_prev()) {
+			      // ? > 1 stride (non-contiguous)   <--------------------
+                                if ((gs == -1) && (abs(iw.get_maddr() - iw.get_maddr_prev()) > 1)) 
+                                    gs = w;
+                            }
+                            iw.get_maddr_prev() = iw.get_maddr();
+                        }
+
+			//Once a gather/scatter, always a gather/scatter
+			if (gs == -1) {
+			  
+			  InstrInfo & target_iinfo = (w == 0) ? gather_iinfo : scatter_iinfo;			    
+			  for(k=0; k<NGS; k++) {
+			    
+			    //end
+			    if (target_iinfo.get_iaddrs()[k] == 0)
+			      break;
+			    
+			    if (target_iinfo.get_iaddrs()[k] == iw.w_iaddrs(w, i)) {
+			      gs = w;
+			      break;
+			    }
+			  }
+			}		   			
+
+                        // Update other_cnt
+                        if (gs == -1)
+			  trace_info.other_cnt++;
+			//trace_info.other_cnt += iw.w_cnt(w, i);
+
+                        // GATHER or SCATTER handling
+			int did_record = 0;
+                        if (gs == 0 || gs == 1) {
+                            InstrInfo & target_iinfo = (gs == 0) ? gather_iinfo : scatter_iinfo;
+
+                            if (gs == 0) {
+                                trace_info.gather_occ_avg += iw.w_cnt(w, i);
+                                gather_metrics.cnt += 1.0;
+                            }
+                            else {
+                                trace_info.scatter_occ_avg += iw.w_cnt(w, i);
+                                scatter_metrics.cnt += 1.0;
+                            }
+
+                            for (k = 0; k < NGS; k++) {
+                                if (target_iinfo.get_iaddrs()[k] == 0) {
+                                    target_iinfo.get_iaddrs()[k] = iw.w_iaddrs(w, i);
+                                    (target_iinfo.get_icnt()[k])++;
+                                    target_iinfo.get_occ()[k] += iw.w_cnt(w, i);
+				    did_record = 1;
+                                    break;
+                                }
+
+                                if (target_iinfo.get_iaddrs()[k] == iw.w_iaddrs(w, i)) {
+                                    (target_iinfo.get_icnt()[k])++;
+                                    target_iinfo.get_occ()[k] += iw.w_cnt(w, i);
+				    did_record = 1;
+                                    break;
+                                }
+                            }
+			    assert(did_record == 1);
+                        } // - if
+                    } //WINDOW i - for
+
+                    w_idx = 0;
+
+                    // reset windows
+                    iw.reset(w);
+                } // rw w - for
+            } // analysis - if
+
+            // Set window values
+            iw.w_iaddrs(w_rw_idx, w_idx) = iw.get_iaddr();
+            iw.w_maddr(w_rw_idx, w_idx, iw.w_cnt(w_rw_idx, w_idx)) = ia.get_maddr();
+            iw.w_bytes(w_rw_idx, w_idx) += ia.get_size();
+
+            // num access per iaddr in loop
+            iw.w_cnt(w_rw_idx, w_idx)++;
+
+            if (trace_info.did_opcode) {
+
+                trace_info.opcodes_mem++;
+                trace_info.addrs++;
+                trace_info.did_opcode = false;
+
+            } else {
+                trace_info.addrs++;
+            }
+	    
+        } else {
+            /***********************/
+            /** SOMETHING ELSE **/
+            /***********************/
+
+            trace_info.other++;
+        }
+
+        trace_info.trace_lines++;
+    }
+
+    template <typename std::size_t T>
+    void display_stats(MemPatterns<T> & mp)
+    {
+        printf("\n RESULTS \n");
+
+        printf("DRTRACE STATS\n");
+        printf("DRTRACE LINES:        %16lu\n", mp.get_trace_info().trace_lines);
+        printf("OPCODES:              %16lu\n", mp.get_trace_info().opcodes);
+        printf("MEMOPCODES:           %16lu\n", mp.get_trace_info().opcodes_mem);
+        printf("LOAD/STORES:          %16lu\n", mp.get_trace_info().addrs);
+        printf("OTHER:                %16lu\n", mp.get_trace_info().other);
+
+        printf("\n");
+
+        printf("FIRST PASS GATHER/SCATTER STATS: \n");
+        printf("LOADS per GATHER:     %16.3f\n", mp.get_trace_info().gather_occ_avg);
+        printf("STORES per SCATTER:   %16.3f\n", mp.get_trace_info().scatter_occ_avg);
+        printf("GATHER COUNT:         %16.3f (log2)\n", log(mp.get_gather_metrics().cnt) / log(2.0));
+        printf("SCATTER COUNT:        %16.3f (log2)\n", log(mp.get_scatter_metrics().cnt) / log(2.0));
+        printf("OTHER  COUNT:         %16.3f (log2)\n", log(mp.get_trace_info().other_cnt) / log(2.0));
+	
+#if SYMBOLS_ONLY
+	if (mp.get_gather_metrics().iaddrs_nosym || mp.get_scatter_metrics().iaddrs_nosym) {
+	  printf("\n");
+	  printf("IGNORED NONSYMBOL STATS:\n");
+	  printf("gather unique iaddrs:  %16ld\n", mp.get_gather_metrics().iaddrs_nosym); 
+	  printf("gather total indices:  %16ld (%5.2f%c of 1st pass gathers)\n",
+		 mp.get_gather_metrics().indices_nosym,
+		 100.0 * (double)mp.get_gather_metrics().indices_nosym /
+		 (double)(mp.get_gather_metrics().indices_nosym + mp.get_gather_metrics().indices_sym),
+		 '%');  
+	  printf("scatter unique iaddrs: %16ld\n", mp.get_scatter_metrics().iaddrs_nosym); 
+	  printf("scatter total indices: %16ld (%5.2f%c of 1st pass scatters)\n",
+		 mp.get_scatter_metrics().indices_nosym,
+		 100.0 * (double)mp.get_scatter_metrics().indices_nosym /
+		 (double)(mp.get_scatter_metrics().indices_nosym + mp.get_scatter_metrics().indices_sym),'%');
+	  printf("\n");
+	  printf("KEPT SYMBOL STATS:\n");
+	  printf("gather unique iaddrs:  %16ld\n", mp.get_scatter_metrics().iaddrs_sym); 
+	  printf("gather total indices:  %16ld\n", mp.get_scatter_metrics().indices_sym);  
+	  printf("scatter unique iaddrs: %16ld\n", mp.get_scatter_metrics().iaddrs_sym); 
+	  printf("scatter total indices: %16ld\n", mp.get_scatter_metrics().indices_sym);
+	}
+#endif
+  
+    }
+
+    int get_top_target(InstrInfo & target_iinfo, Metrics & target_metrics);
+
+    void normalize_stats(Metrics & target_metrics);
+
+    bool handle_2nd_pass_trace_entry(const InstrAddrAdapter & ia,
+                                     Metrics & gather_metrics, Metrics & scatter_metrics,
+                                     addr_t & iaddr, int64_t & maddr, uint64_t & mcnt,
+                                     addr_t * gather_base, addr_t * scatter_base);
+
+    void create_metrics_file(FILE * fp,
+                             FILE * fp2,
+                             const std::string & file_prefix,
+                             Metrics & target_metrics,
+                             bool & first_spatter);
+
+    template <typename std::size_t T>
+    void create_spatter_file(MemPatterns<T> & mp, const std::string & file_prefix)
+    {
+        // Create spatter file
+        FILE *fp, *fp2;
+
+        if (file_prefix.empty()) throw GSFileError ("Empty file prefix provided.");
+
+        std::string json_name = file_prefix + ".json";
+        fp = fopen(json_name.c_str(), "w");
+        if (NULL == fp) {
+            throw GSFileError("Could not open " + json_name + "!");
+        }
+
+        std::string gs_info = file_prefix + ".txt";
+        fp2 = fopen(gs_info.c_str(), "w");
+        if (NULL == fp2) {
+            throw GSFileError("Could not open " + gs_info + "!");
+        }
+
+        // Header
+        fprintf(fp, "[ ");
+	fprintf(fp2, "#iaddr, sourceline, type size bytes, g/s, nindices, final percentage of g/s\n");
+
+        bool first_spatter = true;
+        create_metrics_file(fp, fp2, file_prefix, mp.get_gather_metrics(), first_spatter);
+
+        create_metrics_file(fp, fp2, file_prefix, mp.get_scatter_metrics(), first_spatter);
+
+        // Footer
+        fprintf(fp, " ]");
+        fclose(fp);
+        fclose(fp2);
+    }
+
+} // gs_patterns_core
+
+} // gs_patterns
diff --git a/gs_patterns_main.cpp b/gs_patterns_main.cpp
new file mode 100644
index 0000000..3794c28
--- /dev/null
+++ b/gs_patterns_main.cpp
@@ -0,0 +1,118 @@
+#include <stdexcept>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <exception>
+
+#include "gs_patterns.h"
+#include "gs_patterns_core.h"
+#include "gspin_patterns.h"
+#include "gsnv_patterns.h"
+#include "utils.h"
+
+#define GSNV_CONFIG_FILE "GSNV_CONFIG_FILE"
+
+using namespace gs_patterns;
+using namespace gs_patterns::gs_patterns_core;
+using namespace gs_patterns::gsnv_patterns;
+using namespace gs_patterns::gspin_patterns;
+
+void usage (const std::string & prog_name)
+{
+    std::cerr << "Usage: " << prog_name << " <pin_trace.gz> <prog_bin> \n"
+              << "       " << prog_name << " <nvbit_trace.gz> -nv [-ow] [-v]" << std::endl;
+}
+
+int main(int argc, char ** argv)
+{
+    try
+    {
+        bool use_gs_nv = false;
+        bool verbose = false;
+        bool one_warp = false;
+        for (int i = 0; i < argc; i++) {
+            if (std::string(argv[i]) == "-nv") {
+                use_gs_nv = true;
+            }
+            else if (std::string(argv[i]) == "-v") {
+                verbose = true;
+            }
+            else if (std::string(argv[i]) == "-ow") {
+                one_warp = true;
+            }
+        }
+
+        size_t pos = std::string(argv[0]).find_last_of("/");
+        std::string prog_name = std::string(argv[0]).substr(pos+1);
+
+        if (argc < 3) {
+            usage(prog_name);
+            throw GSError("Invalid program arguments");
+        }
+
+        if (use_gs_nv)
+        {
+            MemPatternsForNV mp;
+
+            mp.set_trace_file(argv[1]);
+
+            const char * config_file = std::getenv(GSNV_CONFIG_FILE);
+            if (config_file) {
+                mp.set_config_file(config_file);
+            }
+            if (verbose) mp.set_log_level(1);
+            if (one_warp) mp.set_one_warp_mode(one_warp);
+
+            // ----------------- Process Traces -----------------
+
+            mp.process_traces();
+
+            // ----------------- Generate Patterns -----------------
+
+            mp.generate_patterns();
+        }
+        else
+        {
+            MemPatternsForPin mp;
+
+            mp.set_trace_file(argv[1]);
+            mp.set_binary_file(argv[2]);
+            if (verbose) mp.set_log_level(1);
+
+            // ----------------- Process Traces -----------------
+
+            mp.process_traces();
+
+            // ----------------- Generate Patterns -----------------
+
+            mp.generate_patterns();
+        }
+    }
+    catch (const GSFileError & ex)
+    {
+        std::cerr << "ERROR: <GSFileError> " << ex.what() << std::endl;
+        exit(-1);
+    }
+    catch (const GSAllocError & ex)
+    {
+        std::cerr << "ERROR: <GSAllocError> " << ex.what() << std::endl;
+        exit(-1);
+    }
+    catch (const GSDataError & ex)
+    {
+        std::cerr << "ERROR: <GSDataError> " << ex.what() << std::endl;
+        exit(1);
+    }
+    catch (const GSError & ex)
+    {
+        std::cerr << "ERROR: <GSError> " << ex.what() << std::endl;
+        exit(1);
+    }
+    catch (const std::exception & ex)
+    {
+        std::cerr << "ERROR: " << ex.what() << std::endl;
+        exit(-1);
+    }
+
+    return 0;
+}
diff --git a/gsnv_patterns.cpp b/gsnv_patterns.cpp
new file mode 100644
index 0000000..11e84e5
--- /dev/null
+++ b/gsnv_patterns.cpp
@@ -0,0 +1,862 @@
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#include <map>
+#include <unordered_map>
+#include <set>
+#include <algorithm>
+
+#include <zlib.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <cmath>
+#include <string.h>
+#include <algorithm>
+
+#include "gs_patterns.h"
+#include "gs_patterns_core.h"
+#include "gsnv_patterns.h"
+#include "utils.h"
+#include "nvbit_tracing/gsnv_trace/common.h"
+
+// Enable to use a vector for storing trace data for use by second pass (if not defined data is stored to a temp file
+//#define USE_VECTOR_FOR_SECOND_PASS 1
+
+#define HEX(x)                                                            \
+    "0x" << std::setfill('0') << std::setw(16) << std::hex << (uint64_t)x \
+         << std::dec
+
+namespace gs_patterns
+{
+namespace gsnv_patterns
+{
+
+using namespace gs_patterns::gs_patterns_core;
+
+int tline_read_header(gzFile fp, trace_header_t * val, trace_header_t **p_val, int *edx)
+{
+
+    int idx;
+
+    idx = (*edx) / sizeof(trace_header_t);
+    //first read
+    if (NULL == *p_val) {
+        *edx = gzread(fp, val, sizeof(trace_header_t));
+        *p_val = val;
+    }
+    else if (*p_val == &val[idx]) {
+        *edx = gzread(fp, val, sizeof(trace_header_t));
+        *p_val = val;
+    }
+
+    if (0 == *edx)
+        return 0;
+
+    return 1;
+}
+
+int tline_read_maps(gzFile fp, trace_map_entry_t * val, trace_map_entry_t **p_val, int *edx)
+{
+
+    int idx;
+
+    idx = (*edx) / sizeof(trace_map_entry_t);
+    //first read
+    if (NULL == *p_val) {
+        *edx = gzread(fp, val, sizeof(trace_map_entry_t));
+        *p_val = val;
+    }
+    else if (*p_val == &val[idx]) {
+        *edx = gzread(fp, val, sizeof(trace_map_entry_t));
+        *p_val = val;
+    }
+
+    if (0 == *edx)
+        return 0;
+
+    return 1;
+}
+
+int tline_read(gzFile fp, mem_access_t * val, mem_access_t **p_val, int *edx)
+{
+
+    int idx;
+
+    idx = (*edx) / sizeof(mem_access_t);
+    //first read
+    if (NULL == *p_val) {
+        *edx = gzread(fp, val, sizeof(mem_access_t) * NBUFS);
+        *p_val = val;
+
+    } else if (*p_val == &val[idx]) {
+        *edx = gzread(fp, val, sizeof(mem_access_t) * NBUFS);
+        *p_val = val;
+    }
+
+    if (0 == *edx)
+        return 0;
+
+    return 1;
+}
+
+Metrics & MemPatternsForNV::get_metrics(mem_access_type m)
+{
+    switch (m)
+    {
+        case GATHER : return _metrics.first;
+        case SCATTER : return _metrics.second;
+        default:
+            throw GSError("Unable to get Metrics - Invalid Metrics Type: " + std::to_string(m));
+    }
+}
+
+InstrInfo & MemPatternsForNV::get_iinfo(mem_access_type m)
+{
+    switch (m)
+    {
+        case GATHER : return _iinfo.first;
+        case SCATTER : return _iinfo.second;
+        default:
+            throw GSError("Unable to get InstrInfo - Invalid Metrics Type: " + std::to_string(m));
+    }
+}
+
+void MemPatternsForNV::handle_trace_entry(const InstrAddrAdapter & ia)
+{
+    // Call libgs_patterns
+    gs_patterns_core::handle_trace_entry(*this, ia);
+
+    const InstrAddrAdapterForNV &ianv = dynamic_cast<const InstrAddrAdapterForNV &> (ia);
+#ifdef USE_VECTOR_FOR_SECOND_PASS
+    _traces.push_back(ianv);
+#else
+    if (std::fwrite(reinterpret_cast<const char *>(&ianv.get_trace_entry()), sizeof(trace_entry_t), 1, _tmp_dump_file) != 1)
+    {
+        throw GSFileError("Write of trace to temp file failed");
+    }
+#endif
+}
+
+void MemPatternsForNV::generate_patterns()
+{
+    if (_traces_handled < 1) {
+        std::cout << "No traces match criteria, skipping pattern generation" << std::endl;
+        return;
+    }
+
+    // ----------------- Write out Trace Files (if requested ) -----------------
+
+    write_trace_out_file();
+
+    // ----------------- Update Source Lines -----------------
+
+     update_source_lines();
+
+    // ----------------- Update Metrics -----------------
+
+    update_metrics();
+
+    // ----------------- Create Spatter File -----------------
+
+    create_spatter_file<MEMORY_ACCESS_SIZE>(*this, get_file_prefix());
+
+}
+
+void MemPatternsForNV::update_metrics()
+{
+    // Get top gathers
+    get_gather_metrics().ntop = get_top_target(get_gather_iinfo(), get_gather_metrics());
+
+    // Get top scatters
+    get_scatter_metrics().ntop = get_top_target(get_scatter_iinfo(), get_scatter_metrics());
+
+    // ----------------- Second Pass -----------------
+
+    process_second_pass();
+
+    // ----------------- Normalize -----------------
+
+    normalize_stats(get_gather_metrics());
+    normalize_stats(get_scatter_metrics());
+}
+
+std::string MemPatternsForNV::get_file_prefix()
+{
+    if (!_file_prefix.empty()) return _file_prefix;
+
+    // If no file_prefix was set try extracting one from trace_file
+    std::string prefix = _trace_file_name;
+    size_t pos = std::string::npos;
+    while (std::string::npos != (pos = prefix.find(".gz")))
+    {
+        prefix.replace(pos, 3, "");
+    }
+    return prefix;
+}
+
+// Store opcode mappings
+bool MemPatternsForNV::add_or_update_opcode(int opcode_id, const std::string & opcode)
+{
+    auto it = _id_to_opcode_map.find(opcode_id);
+    if (it == _id_to_opcode_map.end()) {
+        _id_to_opcode_map[opcode_id] = opcode;
+        //std::cout << "OPCODE: " << opcode_id << " -> " << opcode << std::endl;
+        return true;
+    }
+    return false;
+}
+
+// Retrieve opcode mapping by opcode_id
+const std::string & MemPatternsForNV::get_opcode(int opcode_id)
+{
+    auto result = _id_to_opcode_map.find(opcode_id);
+    if (result != _id_to_opcode_map.end()) {
+        return result->second;
+    }
+    std::stringstream ss;
+    ss << "Unknown opcode_id: " << opcode_id;
+    throw GSDataError(ss.str());
+}
+
+// Store opcode_short mappings
+bool MemPatternsForNV::add_or_update_opcode_short(int opcode_short_id, const std::string & opcode_short)
+{
+    auto it = _id_to_opcode_short_map.find(opcode_short_id);
+    if (it == _id_to_opcode_short_map.end()) {
+        _id_to_opcode_short_map[opcode_short_id] = opcode_short;
+        //std::cout << "OPCODE: " << opcode_id << " -> " << opcode << std::endl;
+        return true;
+    }
+    return false;
+}
+
+// Retrieve opcode_short mapping by opcode_short_id
+const std::string & MemPatternsForNV::get_opcode_short(int opcode_short_id)
+{
+    auto result = _id_to_opcode_short_map.find(opcode_short_id);
+    if (result != _id_to_opcode_short_map.end()) {
+        return result->second;
+    }
+    std::stringstream ss;
+    ss << "Unknown opcode_short_id: " << opcode_short_id;
+    throw GSDataError(ss.str());
+}
+
+// Store line  mappings
+bool MemPatternsForNV::add_or_update_line(int line_id, const std::string & line)
+{
+    auto it = _id_to_line_map.find(line_id);
+    if (it == _id_to_line_map.end()) {
+        _id_to_line_map[line_id] = line;
+        //std::cout << "LINE: " << line_id << " -> " << line << std::endl;
+        return true;
+    }
+    return false;
+}
+
+// Retrieve line number mapping by line_id
+const std::string & MemPatternsForNV::get_line(int line_id)
+{
+    auto result = _id_to_line_map.find(line_id);
+    if (result != _id_to_line_map.end()) {
+        return result->second;
+    }
+    std::stringstream ss;
+    ss << "Unknown line_id: " << line_id;
+    throw GSDataError(ss.str());
+}
+
+/*
+ * Read traces from a nvbit trace file. Includes header which describes opcode mappings used in trace data.
+ * Used by test runner (gsnv_test) to simulate nvbit execution.
+ */
+void MemPatternsForNV::process_traces()
+{
+    int iret = 0;
+    mem_access_t * t_line;
+
+    gzFile fp_trace;
+    try
+    {
+        fp_trace = open_trace_file(get_trace_file_name());
+    }
+    catch (const std::runtime_error & ex)
+    {
+        throw GSFileError(ex.what());
+    }
+
+    // Read header **
+    trace_header_t * p_header = NULL;
+    trace_header_t  header[1];
+    tline_read_header(fp_trace, header, &p_header, &iret);
+
+    uint32_t count = 0;
+    trace_map_entry_t * p_map_entry = NULL;
+    trace_map_entry_t map_entry[1];
+    while (count < p_header->num_map_entires && tline_read_maps(fp_trace, map_entry, &p_map_entry, &iret) ) {
+
+        if (_log_level >= 1) {
+            std::cout << "MAP: " << p_map_entry->map_name << " entry [" << p_map_entry->id << "] -> ["
+                      << p_map_entry->val << "]" << std::endl;
+        }
+
+        if (std::string(p_map_entry->map_name) == ID_TO_OPCODE) {
+            _id_to_opcode_map[p_map_entry->id] = p_map_entry->val;
+        }
+        else if (std::string(p_map_entry->map_name) == ID_TO_OPCODE_SHORT) {
+            _id_to_opcode_short_map[p_map_entry->id]  = p_map_entry->val;
+        }
+        else if (std::string(p_map_entry->map_name) == ID_TO_LINE) {
+            _id_to_line_map[p_map_entry->id]  = p_map_entry->val;
+        }
+        else {
+            std::cerr << "Unsupported Map: " << p_map_entry->map_name << " found in trace, ignoring ..."
+                      << p_map_entry->id << " -> " << p_map_entry->val << std::endl;
+        }
+
+        count++;
+        p_map_entry++;
+    }
+
+    // Read Traces **
+    iret = 0;
+    uint64_t lines_read = 0;
+    uint64_t pos = 0;
+    mem_access_t * p_trace = NULL;
+    mem_access_t trace_buff[NBUFS]; // was static (1024 bytes)
+    while (tline_read(fp_trace, trace_buff, &p_trace, &iret))
+    {
+        // Decode trace
+        t_line = p_trace;
+
+        if (-1 == t_line->cta_id_x) { continue; }
+
+        try
+        {
+            // Progress bar
+            if (lines_read == 0) {
+                for (int i = 0; i < 100; i++) { std::cout << "-"; }
+                std::cout << std::endl;
+            }
+            if (lines_read % ((uint64_t) std::max((p_header->total_traces * .01), 1.0)) == 0) {
+                if ((pos % 20) == 0) { std::cout << "|"; }
+                else { std::cout << "+"; }
+                std::flush(std::cout);
+                pos++;
+            }
+
+            handle_cta_memory_access(t_line);
+
+            p_trace++;
+            lines_read++;
+        }
+        catch (const GSError & ex) {
+            std::cerr << "ERROR: " << ex.what() << std::endl;
+            close_trace_file(fp_trace);
+            throw;
+        }
+    }
+
+    std::cout << "\nLines Read: " << lines_read << " of Total: " << p_header->total_traces << std::endl;
+
+    close_trace_file(fp_trace);
+
+    //metrics
+    get_trace_info().gather_occ_avg /= get_gather_metrics().cnt;
+    get_trace_info().scatter_occ_avg /= get_scatter_metrics().cnt;
+
+    display_stats<MEMORY_ACCESS_SIZE>(*this);
+
+}
+
+void MemPatternsForNV::update_source_lines()
+{
+    // Requires Kernel having been built with "--generate-line-info" so that trace file header contain mappings
+
+    // Find source lines for gathers
+    printf("\nSymbol table lookup for gathers...");
+    fflush(stdout);
+
+    get_gather_metrics().cnt = update_source_lines_from_binary(GATHER);
+
+    // Find source lines for scatters
+    printf("Symbol table lookup for scatters...");
+    fflush(stdout);
+
+    get_scatter_metrics().cnt = update_source_lines_from_binary(SCATTER);
+}
+
+double MemPatternsForNV::update_source_lines_from_binary(mem_access_type mType)
+{
+    double target_cnt = 0.0;
+
+    InstrInfo & target_iinfo   = get_iinfo(mType);
+    Metrics &   target_metrics = get_metrics(mType);
+
+    for (int k = 0; k < NGS; k++) {
+
+        if (0 == target_iinfo.get_iaddrs()[k]) {
+            break;
+        }
+
+        std::string line;
+        line = addr_to_line(target_iinfo.get_iaddrs()[k]);
+        strncpy(target_metrics.get_srcline()[k], line.c_str(), MAX_LINE_LENGTH-1);
+
+        if (std::string(target_metrics.get_srcline()[k]).empty())
+            target_iinfo.get_icnt()[k] = 0;
+
+        target_cnt += target_iinfo.get_icnt()[k];
+    }
+    printf("done.\n");
+
+    return target_cnt;
+
+}
+
+void MemPatternsForNV::process_second_pass()
+{
+    uint64_t mcnt = 0;  // used our own local mcnt while iterating over file in this method.
+
+    // State carried thru
+    addr_t iaddr;
+    int64_t maddr;
+    addr_t gather_base[NTOP] = {0};
+    addr_t scatter_base[NTOP] = {0};
+
+    bool breakout = false;
+    printf("\nSecond pass to fill gather / scatter subtraces\n");
+    fflush(stdout);
+
+#ifdef USE_VECTOR_FOR_SECOND_PASS
+    for (auto itr = _traces.begin(); itr != _traces.end(); ++itr)
+    {
+        InstrAddrAdapter & ia = *itr;
+
+        breakout = ::handle_2nd_pass_trace_entry(ia, get_gather_metrics(), get_scatter_metrics(),
+                                                 iaddr, maddr, mcnt, gather_base, scatter_base);
+        if (breakout) {
+            break;
+        }
+    }
+#else
+    std::fflush(_tmp_dump_file);
+    std::rewind(_tmp_dump_file); // Back to the future, ... sort of
+    try
+    {
+        trace_entry_t ta[TRACE_BUFFER_LENGTH];
+        size_t count_read = 0;
+        size_t read;
+        while ( !breakout && (read = std::fread(&ta, sizeof (ta[0]), TRACE_BUFFER_LENGTH, _tmp_dump_file)) )
+        {
+            for (int i = 0; i < read; i++)
+            {
+                InstrAddrAdapterForNV ia(const_cast<const trace_entry_t &>(ta[i]));
+                breakout = handle_2nd_pass_trace_entry(ia, get_gather_metrics(), get_scatter_metrics(),
+                                                         iaddr, maddr, mcnt, gather_base, scatter_base);
+                count_read++;
+
+                if (breakout) break;
+            }
+        }
+        std::cout << "Reread: " << count_read << " for second_pass " << std::endl;
+
+        if (!breakout && !std::feof(_tmp_dump_file)) {
+            if (std::ferror(_tmp_dump_file)) {
+                throw GSFileError("Unexpected error occurred while reading temp file");
+            }
+        }
+        std::fclose(_tmp_dump_file);
+    }
+    catch (const GSError & ex)
+    {
+        std::cerr << "ERROR: " << ex.what() << std::endl;
+        std::fclose(_tmp_dump_file);
+        throw;
+    }
+#endif
+}
+
+bool MemPatternsForNV::convert_to_trace_entry(const mem_access_t & ma,
+                                              bool ignore_partial_warps,
+                                              std::vector<trace_entry_t> & te_list)
+{
+    // Optionally, use traces from warp_id 0 only
+    if (_one_warp_mode && ma.warp_id != 0 )
+        return false;
+
+    uint16_t mem_size = ma.size;
+    uint16_t mem_type_code;
+
+    if (ma.is_load)
+        mem_type_code = GATHER;
+    else if (ma.is_store)
+        mem_type_code = SCATTER;
+    else
+        throw GSDataError ("Invalid mem_type must be LD(0) or ST(1)");
+
+    if (_id_to_opcode_short_map.find(ma.opcode_short_id) == _id_to_opcode_short_map.end())
+        return false;
+    std::string opcode_short = _id_to_opcode_short_map[ma.opcode_short_id];
+
+    if (_target_opcodes.find(opcode_short) == _target_opcodes.end())
+        return false;
+
+    // TODO: This is a SLOW way of doing this
+    const addr_t & base_addr = ma.addrs[0];
+    te_list.reserve(MemPatternsForNV::CTA_LENGTH);
+    for (int i = 0; i < MemPatternsForNV::CTA_LENGTH; i++)
+    {
+        if (ma.addrs[i] != 0)
+        {
+            trace_entry_t te { mem_type_code, mem_size, ma.addrs[i], base_addr, ma.iaddr };
+            te_list.push_back(te);
+
+            if (_addr_to_line_id.find(ma.iaddr) == _addr_to_line_id.end()) {
+                _addr_to_line_id[ma.iaddr] = ma.line_id;
+            }
+        }
+        else if (ignore_partial_warps)
+        {
+            // Ignore memory_accesses which have less than MemPatternsForNV::CTA_LENGTH
+            return false;
+        }
+    }
+    return true;
+}
+
+void MemPatternsForNV::handle_cta_memory_access(const mem_access_t * ma)
+{
+    if (exceed_max_count()) { return; }
+
+    if (!_first_trace_seen) {
+        _first_trace_seen = true;
+        printf("First pass to find top gather / scatter iaddresses\n");
+        fflush(stdout);
+
+#ifndef USE_VECTOR_FOR_SECOND_PASS
+        // Open an output file for dumping temp data used exclusively by second_pass
+        _tmp_dump_file = tmpfile();
+        if (!_tmp_dump_file) {
+            throw GSFileError("Unable to create a temp file for second pass");
+        }
+#endif
+    }
+
+    if (_write_trace_file && _ofs_tmp.is_open()) {
+        // Write entry to trace_output file
+        _ofs_tmp.write(reinterpret_cast<const char *>(ma), sizeof *ma);
+        _traces_written++;
+    }
+
+    if (_log_level >= 3) {
+        std::stringstream ss;
+        //ss << "CTX " << HEX(ctx) << " - grid_launch_id "
+        ss << "GSNV_TRACE: grid_launch_id: "
+           << ma->grid_launch_id << " - CTA: " << ma->cta_id_x << "," << ma->cta_id_y << "," << ma->cta_id_z
+           << " - warp: " << ma->warp_id
+           << " - iaddr: " << HEX(ma->iaddr)
+           << " line_id: " << ma->line_id
+           << " - " << get_opcode(ma->opcode_id)
+           << " - shortOpcode: " << ma->opcode_short_id
+           << " isLoad: " << ma->is_load << " isStore: " << ma->is_store
+           << " size: " << ma->size << " - ";
+
+        for (int i = 0; i < MemPatternsForNV::CTA_LENGTH; i++) {
+            ss << HEX(ma->addrs[i]) << " ";
+        }
+        std::cout << ss.str() << std::endl;
+    }
+
+    // Convert to vector of trace_entry_t if full warp. ignore partial warps.
+    std::vector<trace_entry_t> te_list;
+    te_list.reserve(MemPatternsForNV::CTA_LENGTH);
+
+    bool status = convert_to_trace_entry(*ma, true, te_list);
+    if (!status) return;
+
+    uint64_t min_size = !te_list.empty() ? (te_list[0].size) + 1 : 0;
+    if (min_size > 0 && valid_gs_stride(te_list, min_size))
+    {
+        for (auto it = te_list.begin(); it != te_list.end(); it++)
+        {
+            handle_trace_entry(InstrAddrAdapterForNV(*it));
+        }
+        _traces_handled++;
+    }
+}
+
+bool MemPatternsForNV::valid_gs_stride(const std::vector<trace_entry_t> & te_list, const uint32_t min_stride)
+{
+    uint32_t min_stride_found = INT32_MAX;
+    uint64_t last_addr = 0;
+    bool first = true;
+    for (auto it = te_list.begin(); it != te_list.end(); it++)
+    {
+        const trace_entry_t & te = *it;
+        if (first) {
+            first = false;
+            last_addr = te.addr;
+            continue;
+        }
+
+        uint64_t diff = std::llabs ((int64_t)(last_addr - te.addr));
+        if (diff < min_stride)
+            return false;
+
+        if (diff < min_stride_found)
+            min_stride_found = diff;
+
+        last_addr = te.addr;
+    }
+
+    return min_stride_found >= min_stride;
+}
+
+void MemPatternsForNV::set_trace_file(const std::string & trace_file_name)
+{
+    if (trace_file_name == _trace_out_file_name) {
+        throw GSError ("Cannot set trace input file to same name as trace output file [" + trace_file_name + "].");
+    }
+
+    _trace_file_name = trace_file_name;
+}
+
+void MemPatternsForNV::set_trace_out_file(const std::string & trace_out_file_name)
+{
+    try
+    {
+        if (trace_out_file_name.empty()) {
+            throw GSError ("Cannot set trace output file to empty filename [" + trace_out_file_name + "].");
+        }
+
+        if (trace_out_file_name == _trace_file_name) {
+            throw GSError ("Cannot set trace output file to same name as trace input file [" + trace_out_file_name + "].");
+        }
+
+        _trace_out_file_name     = trace_out_file_name;
+        _tmp_trace_out_file_name = _trace_out_file_name + ".tmp";
+
+        // Open a temp file for writing data
+        _ofs_tmp.open(_tmp_trace_out_file_name, std::ios::binary | std::ios::trunc | std::ios::in | std::ios::out);
+        if (!_ofs_tmp.is_open()) {
+            throw GSFileError("Unable to open " + _tmp_trace_out_file_name + " for writing");
+        }
+        std::remove(_tmp_trace_out_file_name.c_str());  // Force auto cleanup
+
+        // Open a ouput file for writing data header and appending data
+        _ofs.open(_trace_out_file_name, std::ios::binary | std::ios::trunc);
+        if (!_ofs.is_open()) {
+            throw GSFileError("Unable to open " + _trace_out_file_name + " for writing");
+        }
+
+        _write_trace_file = true;
+    }
+    catch (const std::exception & ex)
+    {
+        std::cerr << "ERROR: " << ex.what() << std::endl;
+        throw;
+    }
+}
+
+void MemPatternsForNV::write_trace_out_file()
+{
+    if (!_write_trace_file || !_first_trace_seen) return;
+
+    /// TODO: COMPRESS trace_file
+    try
+    {
+        std::cout << "\nSaving trace file - writing: " << _traces_written
+                  << " traces_handled: " << _traces_handled << " ... \n" << std::endl;
+
+        _ofs_tmp.flush();
+
+        // Write header
+        trace_header_t header;
+        header.num_maps        = NUM_MAPS;
+        header.num_map_entires = _id_to_opcode_map.size() +
+                                 _id_to_opcode_short_map.size() +
+                                 _id_to_line_map.size();
+        header.total_traces    = _traces_written;
+
+        _ofs.write(reinterpret_cast<const char *>(&header), sizeof header);
+
+        // Write Maps
+        trace_map_entry_t m_entry;
+        strncpy(m_entry.map_name, ID_TO_OPCODE, MAP_NAME_SIZE-1);
+        for (auto itr = _id_to_opcode_map.begin(); itr != _id_to_opcode_map.end(); itr++)
+        {
+            m_entry.id = itr->first;
+            strncpy(m_entry.val, itr->second.c_str(), MAP_VALUE_LONG_SIZE-1);
+            _ofs.write(reinterpret_cast<const char *>(&m_entry), sizeof m_entry);
+        }
+
+        strncpy(m_entry.map_name, ID_TO_OPCODE_SHORT, MAP_NAME_SIZE-1);
+        for (auto itr = _id_to_opcode_short_map.begin(); itr != _id_to_opcode_short_map.end(); itr++)
+        {
+            m_entry.id = itr->first;
+            strncpy(m_entry.val, itr->second.c_str(), MAP_VALUE_LONG_SIZE-1);
+            _ofs.write(reinterpret_cast<const char *>(&m_entry), sizeof m_entry);
+        }
+
+        strncpy(m_entry.map_name, ID_TO_LINE, MAP_NAME_SIZE-1);
+        for (auto itr = _id_to_line_map.begin(); itr != _id_to_line_map.end(); itr++)
+        {
+            m_entry.id = itr->first;
+            strncpy(m_entry.val, itr->second.c_str(), MAP_VALUE_LONG_SIZE-1);
+            _ofs.write(reinterpret_cast<const char *>(&m_entry), sizeof m_entry);
+        }
+        _ofs.flush();
+
+        // Write file contents
+        _ofs_tmp.seekp(0);
+        _ofs << _ofs_tmp.rdbuf();
+        _ofs.flush();
+        _ofs.close();
+        _ofs_tmp.close();
+
+        std::remove(_tmp_trace_out_file_name.c_str());
+
+        std::cout << "Saving trace file - complete" << std::endl;
+
+        if (_log_level >= 1) {
+            std::cout << "Mappings found" << std::endl;
+
+            std::cout << "-- OPCODE_ID to OPCODE MAPPING -- " << std::endl;
+            for (auto itr = _id_to_opcode_map.begin(); itr != _id_to_opcode_map.end(); itr++) {
+                std::cout << itr->first << " -> " << itr->second << std::endl;
+            }
+
+            std::cout << "-- OPCODE_SHORT_ID to OPCODE_SHORT MAPPING -- " << std::endl;
+            for (auto itr = _id_to_opcode_short_map.begin(); itr != _id_to_opcode_short_map.end(); itr++) {
+                std::cout << itr->first << " -> " << itr->second << std::endl;
+            }
+
+            std::cout << "-- LINE_ID to LINE MAPPING -- " << std::endl;
+            for (auto itr = _id_to_line_map.begin(); itr != _id_to_line_map.end(); itr++) {
+                std::cout << itr->first << " -> " << itr->second << std::endl;
+            }
+        }
+    }
+    catch (const std::exception & ex)
+    {
+        std::remove(_tmp_trace_out_file_name.c_str());
+        std::cerr << "ERROR: failed to write trace file: " << _trace_file_name << std::endl;
+        throw;
+    }
+}
+
+void MemPatternsForNV::set_max_trace_count(int64_t max_trace_count)
+{
+    if (max_trace_count < 0) {
+        throw GSError("Max Trace count must be greater than 0");
+    }
+    _max_trace_count = max_trace_count;
+    _limit_trace_count = true;
+
+    if (_log_level >= 1) {
+        std::cout << "Max Trace Count set to: " << _max_trace_count << std::endl;
+    }
+}
+
+void MemPatternsForNV::set_config_file(const std::string & config_file)
+{
+    _config_file_name = config_file;
+    std::ifstream ifs;
+    ifs.open(_config_file_name);
+    if (!ifs.is_open())
+        throw GSFileError("Unable to open config file: " + _config_file_name);
+
+    std::stringstream ss;
+    while (!ifs.eof())
+    {
+        std::string name;
+        std::string value;
+        ifs >> name >> value;
+        if (name.empty() || value.empty() || name[0] == '#')
+            continue;
+
+        ss << "CONFIG: name: " << name << " value: " << value << std::endl;
+
+        try {
+            if (GSNV_TARGET_KERNEL == name) {
+                _target_kernels.insert(value);
+            }
+            else if (GSNV_TRACE_OUT_FILE == name) {
+                set_trace_out_file(value);
+            }
+            else if (GSNV_FILE_PREFIX == name) {
+                set_file_prefix(value);
+            }
+            else if (GSNV_MAX_TRACE_COUNT == name) {
+                int64_t num_val = (int64_t) std::stoi(value);
+                set_max_trace_count(num_val);
+            }
+            else if (GSNV_LOG_LEVEL == name) {
+                int8_t level = atoi(value.c_str());
+                set_log_level(level);
+            }
+            else if (GSNV_ONE_WARP_MODE == name) {
+                int8_t val = atoi(value.c_str());
+                bool mode = val ? true : false;
+                set_one_warp_mode(mode);
+            }
+            else {
+                std::cerr << "Unknown setting <" << name << "> with value <" << value << "> "
+                          << "specified in config file: " << _config_file_name << " ignoring ..." << std::endl;
+            }
+        }
+        catch (const std::exception & ex) {
+            std::cerr << "Failed to set config setting <" << name << "> with value <" << value << "> "
+                      << "due to error: " << ex.what() << " ignoring ..." << std::endl;
+        }
+    }
+
+    if (_log_level >= 1) {
+        std::cout << ss.str();
+    }
+}
+
+bool MemPatternsForNV::should_instrument(const std::string & kernel_name)
+{
+    if (exceed_max_count()) { return false; }
+
+    // Instrument all if none specified
+    if (_target_kernels.size() == 0) {
+        if (_log_level >= 1) {
+            std::cout << "Instrumenting all <by default>: " << kernel_name << std::endl;
+        }
+        return true;
+    }
+
+    auto itr = _target_kernels.find (kernel_name);
+    if ( itr != _target_kernels.end())
+    {
+        if (_log_level >= 1) {
+            std::cout << "Instrumenting: " << kernel_name << std::endl;
+        }
+        return  true;
+    }
+    else {
+        // Try substring match
+        auto itr = std::find_if(_target_kernels.begin(), _target_kernels.end(),
+                                [kernel_name](const std::string & t_kernel) {
+                                    return (t_kernel.compare(kernel_name.substr(0, t_kernel.length())) == 0); } );
+
+        if (itr != _target_kernels.end())
+            return true;
+    }
+
+    if (_log_level >= 2) {
+        std::cout << "Not Instrumenting: " << kernel_name << std::endl;
+    }
+    return false;
+}
+
+} // namespace gsnv_patterns
+
+} // namespace gs_patterns
\ No newline at end of file
diff --git a/gsnv_patterns.h b/gsnv_patterns.h
new file mode 100644
index 0000000..133beb9
--- /dev/null
+++ b/gsnv_patterns.h
@@ -0,0 +1,263 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <map>
+#include <unordered_map>
+#include <set>
+#include <filesystem>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <cmath>
+#include <string.h>
+#include <algorithm>
+
+#include "gs_patterns.h"
+#include "gs_patterns_core.h"
+#include "utils.h"
+
+// Enable to use a vector for storing trace data for use by second pass (if not defined data is stored to a temp file
+//#define USE_VECTOR_FOR_SECOND_PASS 1
+
+#include "nvbit_tracing/gsnv_trace/common.h"
+
+namespace gs_patterns
+{
+namespace gsnv_patterns
+{
+    constexpr std::size_t MEMORY_ACCESS_SIZE = 2048 / 8;
+
+    struct _trace_entry_t {
+        unsigned short type; // 2 bytes: trace_type_t
+        unsigned short size;
+        union {
+            addr_t addr;
+            unsigned char length[sizeof(addr_t)];
+        };
+        addr_t         base_addr;
+        addr_t         iaddr;
+        char           padding[4];
+    }  __attribute__((packed));
+    typedef struct _trace_entry_t trace_entry_t;
+
+    #define MAP_NAME_SIZE 24
+    #define MAP_VALUE_SIZE 22
+    #define MAP_VALUE_LONG_SIZE 94
+    #define NUM_MAPS 3
+    // Setting this to fit within a 4k page e.g. 128 * 32 bytes <= 4k
+    #define TRACE_BUFFER_LENGTH 128
+
+    struct _trace_map_entry_t
+    {
+        // 32 bytes total
+        char     map_name[MAP_NAME_SIZE];
+        uint16_t id;
+        char     val[MAP_VALUE_LONG_SIZE];
+    };
+    typedef struct _trace_map_entry_t trace_map_entry_t;
+
+    struct _trace_header_t {
+        uint64_t  num_maps;
+        uint64_t  num_map_entires;
+        uint64_t  total_traces;
+    };
+    typedef struct _trace_header_t trace_header_t;
+
+
+    // An adapter for trace_entry_t (temporaritly untl replaced with nvbit memory detail type)
+    class InstrAddrAdapterForNV : public InstrAddrAdapter
+    {
+    public:
+        InstrAddrAdapterForNV(const trace_entry_t & te) : _te(te) { }
+
+        virtual ~InstrAddrAdapterForNV() { }
+
+        virtual inline bool            is_valid() const override            { return true;          }
+        virtual inline bool            is_mem_instr() const override        { return true;          }
+        virtual inline bool            is_other_instr() const override      { return false;         }
+        virtual inline mem_access_type get_mem_access_type() const override { return (_te.type == 0) ? GATHER : SCATTER; }
+        virtual inline mem_instr_type  get_mem_instr_type() const override  { return CTA;           }
+
+        virtual inline size_t          get_size() const override            { return _te.size;      } // in bytes
+        virtual inline addr_t          get_base_addr() const override       { return _te.base_addr; }
+        virtual inline addr_t          get_address() const override         { return _te.addr;      }
+        virtual inline addr_t          get_iaddr () const override          { return _te.iaddr;     }
+        virtual inline addr_t          get_maddr () const override          { return _te.addr;      } // was _base_addr
+        virtual inline unsigned short  get_type() const override            { return _te.type;      } // must be 0 for GATHER, 1 for SCATTER !!
+        virtual inline int64_t         get_max_access_size() const override { return  MEMORY_ACCESS_SIZE; } // 32 * 8 bytes
+
+        virtual void output(std::ostream & os) const override   {  os << "InstrAddrAdapterForNV: trace entry: type: ["
+                                                                      << _te.type << "] size: [" << _te.size << "]";  }
+
+        const trace_entry_t & get_trace_entry() const                       { return _te; }
+
+    private:
+        const trace_entry_t  _te;
+    };
+
+    class MemPatternsForNV : public MemPatterns<MEMORY_ACCESS_SIZE>
+    {
+    public:
+        static const uint8_t CTA_LENGTH = 32;
+
+        static constexpr const char * ID_TO_OPCODE         = "ID_TO_OPCODE";
+        static constexpr const char * ID_TO_OPCODE_SHORT   = "ID_TO_OPCODE_SHORT";
+        static constexpr const char * ID_TO_LINE           = "ID_TO_LINE";
+
+        static constexpr const char * GSNV_TARGET_KERNEL   = "GSNV_TARGET_KERNEL";
+        static constexpr const char * GSNV_TRACE_OUT_FILE  = "GSNV_TRACE_OUT_FILE";
+        static constexpr const char * GSNV_PROGRAM_BINARY  = "GSNV_PROGRAM_BINARY";
+        static constexpr const char * GSNV_FILE_PREFIX     = "GSNV_FILE_PREFIX";
+        static constexpr const char * GSNV_MAX_TRACE_COUNT = "GSNV_MAX_TRACE_COUNT";
+        static constexpr const char * GSNV_LOG_LEVEL       = "GSNV_LOG_LEVEL";
+        static constexpr const char * GSNV_ONE_WARP_MODE   = "GSNV_ONE_WARP_MODE";
+
+
+        MemPatternsForNV(): _metrics(GATHER, SCATTER),
+                            _iinfo(GATHER, SCATTER),
+                            _target_opcodes { "LD", "ST", "LDS", "STS", "LDG", "STG" }
+        { }
+
+        virtual ~MemPatternsForNV() override {  }
+
+        void handle_trace_entry(const InstrAddrAdapter & ia) override;
+        void generate_patterns() override;
+
+        Metrics &     get_metrics(mem_access_type) override;
+        InstrInfo &   get_iinfo(mem_access_type) override;
+
+        Metrics &     get_gather_metrics() override  { return _metrics.first;  }
+        Metrics &     get_scatter_metrics() override { return _metrics.second; }
+        InstrInfo &   get_gather_iinfo () override   { return _iinfo.first;    }
+        InstrInfo &   get_scatter_iinfo () override  { return _iinfo.second;   }
+        TraceInfo &   get_trace_info() override      { return _trace_info;     }
+
+        InstrWindow<MEMORY_ACCESS_SIZE> &
+                get_instr_window() override          { return _iw;             }
+
+        void          set_log_level(int8_t level) override      { _log_level = level;      }
+        int8_t        get_log_level() override                  { return _log_level;       }
+
+        void set_trace_file(const std::string & trace_file_name);
+        inline const std::string & get_trace_file_name()        { return _trace_file_name; }
+
+        inline void set_file_prefix(const std::string & prefix) { _file_prefix = prefix;   }
+        std::string get_file_prefix();
+
+        void set_one_warp_mode(bool val)                        { _one_warp_mode = val;    }
+
+        void set_max_trace_count(int64_t max_trace_count);
+        inline bool exceed_max_count() const {
+            if (_limit_trace_count && (_trace_info.trace_lines >= _max_trace_count)) {
+                return true;
+            }
+            return false;
+        }
+
+        // Mainly Called by nvbit kernel
+        void set_config_file (const std::string & config_file);
+
+        void update_metrics();
+
+        void process_traces();
+        void update_source_lines();
+        double update_source_lines_from_binary(mem_access_type);
+        void process_second_pass();
+
+        std::string addr_to_line(addr_t addr)
+        {
+            auto itr = _addr_to_line_id.find(addr);
+            if (itr != _addr_to_line_id.end()) {
+                auto it2 = _id_to_line_map.find(itr->second);
+                if (it2 != _id_to_line_map.end()) {
+                    return it2->second;
+                }
+            }
+            return std::string();
+        }
+
+        void set_trace_out_file(const std::string & trace_file_name);
+        void write_trace_out_file();
+
+        // Handle an nvbit CTA memory update
+        void handle_cta_memory_access(const mem_access_t * ma);
+        // Validate cta stride is within minimum
+        bool valid_gs_stride(const std::vector<trace_entry_t> & te_list, const uint32_t min_stride);
+
+        // TODO: Migrate these to template functions !
+        // -----------------------------------------------------------------
+
+        // Store opcode mappings
+        bool add_or_update_opcode(int opcode_id, const std::string & opcode);
+        // Retrieve opcode mapping by opcode_id
+        const std::string & get_opcode(int opcode_id);
+
+        // Store opcode_short mappings
+        bool add_or_update_opcode_short(int opcode_short_id, const std::string & opcode_short);
+        // Retrieve opcode_short mapping by opcode_short_id
+        const std::string & get_opcode_short(int opcode_short_id);
+
+        // Store line mappings
+        bool add_or_update_line(int line_id, const std::string & line);
+        // Retrieve line number mapping by line_id
+        const std::string & get_line(int line_id);
+
+        // -----------------------------------------------------------------
+
+        bool should_instrument(const std::string & kernel_name);
+
+        bool convert_to_trace_entry(const mem_access_t & ma, bool ignore_partial_warps, std::vector<trace_entry_t> & te_list);
+
+    private:
+
+        std::pair<Metrics, Metrics>        _metrics;
+        std::pair<InstrInfo, InstrInfo>    _iinfo;
+        TraceInfo                          _trace_info;
+        InstrWindow<MEMORY_ACCESS_SIZE>    _iw;
+
+        std::string                        _trace_file_name;            // Input compressed nvbit trace file
+        std::string                        _file_prefix;                // Used by gs_patterns_core to write out pattern files
+        std::string                        _trace_out_file_name;        // Ouput file containing nvbit traces encounterd if requested
+        std::string                        _tmp_trace_out_file_name;    // Temp file used to store traces before re-writing to _trace_out_filename
+
+        std::string                        _config_file_name;
+        std::set<std::string>              _target_kernels;
+        bool                               _limit_trace_count = false;
+        int64_t                            _max_trace_count   = 0;
+        uint64_t                           _traces_written    = 0;
+        uint64_t                           _traces_handled    = 0;
+
+        bool                               _write_trace_file  = false;
+        bool                               _first_trace_seen  = false;
+
+        int8_t                             _log_level         = 0;
+        bool                               _one_warp_mode     = false;
+
+        /* The output stream used to temporarily hold raw trace warp data (mem_access_t) before being writen to _trace_out_file_name */
+        std::fstream                       _ofs_tmp;
+        /* The output stream cooresponding to _trace_out_file_name. Used to store final nvbit trace data with header */
+        std::ofstream                      _ofs;
+
+    #ifdef USE_VECTOR_FOR_SECOND_PASS
+        /* A vector used to store intermediate trace records (trace_entry_t) exclusively for use by second pass
+           (instead of _tmp_dump_file if USE_VECTOR_FOR_SECOND_PASS is defined) */
+        std::vector<InstrAddrAdapterForNV> _traces;
+    #else
+        /* A temp file used to store intermediate trace records (trace_entry_t) exclusively for use by second pass */
+        std::FILE *                        _tmp_dump_file;
+    #endif
+
+        std::map<int, std::string>       _id_to_opcode_map;
+        std::map<int, std::string>       _id_to_opcode_short_map;
+        std::map<int, std::string>       _id_to_line_map;         // Contains source line_id to source line mappings
+        std::unordered_map<addr_t, int>  _addr_to_line_id;        // Contains address to line_id mappings
+        const std::set<std::string>      _target_opcodes;
+    };
+
+} // namespace gsnv_patterns
+
+} // namespace gs_patterns
diff --git a/gspin_patterns.cpp b/gspin_patterns.cpp
new file mode 100644
index 0000000..7aae7b9
--- /dev/null
+++ b/gspin_patterns.cpp
@@ -0,0 +1,261 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <zlib.h>
+
+#include <stdexcept>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <exception>
+
+#include "gs_patterns.h"
+#include "gs_patterns_core.h"
+#include "gspin_patterns.h"
+#include "utils.h"
+
+namespace gs_patterns
+{
+namespace gspin_patterns
+{
+
+using namespace gs_patterns::gs_patterns_core;
+
+int drline_read(gzFile fp, trace_entry_t * val, trace_entry_t ** p_val, int * edx)
+{
+
+    int idx;
+
+    idx = (*edx) / sizeof(trace_entry_t);
+    //first read
+    if (NULL == *p_val) {
+        *edx = gzread(fp, val, sizeof(trace_entry_t) * NBUFS);
+        *p_val = val;
+
+    } else if (*p_val == &val[idx]) {
+        *edx = gzread(fp, val, sizeof(trace_entry_t) * NBUFS);
+        *p_val = val;
+    }
+
+    if (0 == *edx)
+        return 0;
+
+    return 1;
+}
+
+Metrics & MemPatternsForPin::get_metrics(mem_access_type m)
+{
+    switch (m)
+    {
+        case GATHER : return _metrics.first;
+        case SCATTER : return _metrics.second;
+        default:
+            throw GSError("Unable to get Metrics - Invalid Metrics Type: " + std::to_string(m));
+    }
+}
+
+InstrInfo & MemPatternsForPin::get_iinfo(mem_access_type m)
+{
+    switch (m)
+    {
+        case GATHER : return _iinfo.first;
+        case SCATTER : return _iinfo.second;
+        default:
+            throw GSError("Unable to get InstrInfo - Invalid Metrics Type: " + std::to_string(m));
+    }
+}
+
+void MemPatternsForPin::handle_trace_entry(const InstrAddrAdapter & ia)
+{
+    // Call libgs_patterns
+    gs_patterns_core::handle_trace_entry(*this, ia);
+}
+
+void MemPatternsForPin::generate_patterns()
+{
+    // ----------------- Update Source Lines -----------------
+
+    //update_source_lines();
+
+    // ----------------- Update Metrics -----------------
+
+    update_metrics();
+
+    // ----------------- Create Spatter File -----------------
+
+    create_spatter_file<MEMORY_ACCESS_SIZE>(*this, get_file_prefix());
+
+}
+
+void MemPatternsForPin::update_metrics()
+{
+    gzFile fp_drtrace;
+    try
+    {
+        fp_drtrace = open_trace_file(get_trace_file_name());
+    }
+    catch (const std::runtime_error & ex)
+    {
+        throw GSFileError(ex.what());
+    }
+
+    // Get top gathers
+    get_gather_metrics().ntop = get_top_target(get_gather_iinfo(), get_gather_metrics());
+
+    // Get top scatters
+    get_scatter_metrics().ntop = get_top_target(get_scatter_iinfo(), get_scatter_metrics());
+
+    // ----------------- Second Pass -----------------
+
+    process_second_pass(fp_drtrace);
+
+    // ----------------- Normalize -----------------
+
+    normalize_stats(get_gather_metrics());
+    normalize_stats(get_scatter_metrics());
+
+    close_trace_file(fp_drtrace);
+}
+
+std::string MemPatternsForPin::get_file_prefix()
+{
+    std::string prefix = _trace_file_name;
+    size_t pos = std::string::npos;
+    while (std::string::npos != (pos = prefix.find(".gz")))
+    {
+        prefix.replace(pos, 3, "");
+    }
+    return prefix;
+}
+
+double MemPatternsForPin::update_source_lines_from_binary(mem_access_type mType)
+{
+    double target_cnt = 0.0;
+
+    InstrInfo & target_iinfo   = get_iinfo(mType);
+    Metrics &   target_metrics = get_metrics(mType);
+
+    //Check it is not a library
+    for (int k = 0; k < NGS; k++) {
+
+        if (0 == target_iinfo.get_iaddrs()[k]) {
+            break;
+        }
+	
+#if SYMBOLS_ONLY
+        translate_iaddr(get_binary_file_name(), target_metrics.get_srcline()[k], target_iinfo.get_iaddrs()[k]);
+        if (startswith(target_metrics.get_srcline()[k], "?")) {
+            target_iinfo.get_icnt()[k] = 0;
+	    target_metrics.iaddrs_nosym++;
+	    target_metrics.indices_nosym += target_iinfo.get_occ()[k];
+      
+	} else {
+	  target_metrics.iaddrs_sym++;
+	  target_metrics.indices_sym += target_iinfo.get_occ()[k];
+	}
+#endif
+
+        target_cnt += target_iinfo.get_icnt()[k];
+    }
+    printf("done.\n");
+
+    return target_cnt;
+}
+
+// First Pass
+void MemPatternsForPin::process_traces()
+{
+    int iret = 0;
+    trace_entry_t *drline;
+    gzFile fp_drtrace;
+
+    try
+    {
+        fp_drtrace = open_trace_file(get_trace_file_name());
+    }
+    catch (const std::runtime_error & ex)
+    {
+        throw GSFileError(ex.what());
+    }
+
+    printf("First pass to find top gather / scatter iaddresses\n");
+    fflush(stdout);
+
+    uint64_t lines_read = 0;
+    trace_entry_t *p_drtrace = NULL;
+    trace_entry_t drtrace[NBUFS];  // was static (1024 bytes)
+
+    
+    while (drline_read(fp_drtrace, drtrace, &p_drtrace, &iret)) {
+        //decode drtrace
+        drline = p_drtrace;
+
+        handle_trace_entry(InstrAddrAdapterForPin(drline));
+
+        p_drtrace++;
+        lines_read++;
+    }
+
+    std::cout << "Lines Read: " << lines_read << std::endl;
+
+    close_trace_file(fp_drtrace);
+
+    //metrics
+    get_trace_info().gather_occ_avg /= get_gather_metrics().cnt;
+    get_trace_info().scatter_occ_avg /= get_scatter_metrics().cnt;
+
+    // ----------------- Update Source Lines -----------------
+
+    update_source_lines();
+
+    display_stats<MEMORY_ACCESS_SIZE>(*this);
+
+}
+
+void MemPatternsForPin::process_second_pass(gzFile & fp_drtrace)
+{
+    uint64_t mcnt = 0;  // used our own local mcnt while iterating over file in this method.
+    int iret = 0;
+    trace_entry_t *drline;
+
+    // State carried thru
+    addr_t iaddr;
+    int64_t maddr;
+    addr_t gather_base[NTOP] = {0};
+    addr_t scatter_base[NTOP] = {0};
+
+    bool breakout = false;
+    printf("\nSecond pass to fill gather / scatter subtraces\n");
+    fflush(stdout);
+
+    trace_entry_t *p_drtrace = NULL;
+    trace_entry_t drtrace[NBUFS];   // was static (1024 bytes)
+
+    while (drline_read(fp_drtrace, drtrace, &p_drtrace, &iret) && !breakout) {
+        //decode drtrace
+        drline = p_drtrace;
+
+        breakout = handle_2nd_pass_trace_entry(InstrAddrAdapterForPin(drline), get_gather_metrics(), get_scatter_metrics(),
+                                                 iaddr, maddr, mcnt, gather_base, scatter_base);
+
+        p_drtrace++;
+    }
+}
+
+void MemPatternsForPin::update_source_lines()
+{
+    // Find source lines for gathers - Must have symbol
+    printf("\nSymbol table lookup for gathers...");
+    fflush(stdout);
+
+    get_gather_metrics().cnt = update_source_lines_from_binary(GATHER);
+
+    // Find source lines for scatters
+    printf("Symbol table lookup for scatters...");
+    fflush(stdout);
+
+    get_scatter_metrics().cnt = update_source_lines_from_binary(SCATTER);
+}
+
+} // namespace gspin_patterns
+
+} // namespace gs_patterns
diff --git a/gspin_patterns.h b/gspin_patterns.h
new file mode 100644
index 0000000..78d64de
--- /dev/null
+++ b/gspin_patterns.h
@@ -0,0 +1,140 @@
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <zlib.h>
+
+#include <stdexcept>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <exception>
+
+#include "gs_patterns.h"
+#include "gs_patterns_core.h"
+#include "utils.h"
+
+#if !defined(SYMBOLS_ONLY)
+#define SYMBOLS_ONLY 1 //Filter out instructions that have no symbol
+#endif
+
+#if !defined(VBITS)
+# define VBITS (512L)
+# define VBYTES (VBITS/8)
+#endif
+
+namespace gs_patterns
+{
+namespace gspin_patterns
+{
+    constexpr std::size_t MEMORY_ACCESS_SIZE = VBYTES;
+
+    //FROM DR SOURCE
+    //DR trace
+    struct _trace_entry_t {
+        unsigned short type; // 2 bytes: trace_type_t
+        unsigned short size;
+        union {
+            addr_t addr;
+            unsigned char length[sizeof(addr_t)];
+        };
+    }  __attribute__((packed));
+    typedef struct _trace_entry_t trace_entry_t;
+
+    // An adapter for trace_entry_t
+    class InstrAddrAdapterForPin : public InstrAddrAdapter
+    {
+    public:
+        InstrAddrAdapterForPin(const trace_entry_t * te)
+        {
+            /// TODO: do we need to copy this, will we outlive trace_entry_t which is passed in ?
+            _te.type = te->type;
+            _te.size = te->size;
+            _te.addr = te->addr;
+        }
+        InstrAddrAdapterForPin(const trace_entry_t te) : _te(te) { }
+
+        virtual ~InstrAddrAdapterForPin() { }
+
+        virtual inline bool            is_valid() const override            { return !(0 == _te.type && 0 == _te.size);        }
+        virtual inline bool            is_mem_instr() const override        { return ((_te.type == 0x0) || (_te.type == 0x1)); }
+        virtual inline bool            is_other_instr() const override      { return ((_te.type >= 0xa) && (_te.type <= 0x10)) || (_te.type == 0x1e); }
+
+        virtual mem_access_type get_mem_access_type() const override        {
+            if (!is_mem_instr()) throw GSDataError("Not a Memory Instruction - unable to determine Access Type");
+            // Must be 0x0 or 0x1
+            if (_te.type == 0x0) return GATHER;
+            else return SCATTER;
+        }
+        virtual inline mem_instr_type  get_mem_instr_type() const override  { return VECTOR;   }
+
+        virtual inline size_t          get_size() const override            { return _te.size; }
+        virtual inline addr_t          get_base_addr() const override       { return _te.addr; }
+        virtual inline addr_t          get_address() const override         { return _te.addr; }
+        virtual inline addr_t          get_iaddr() const override           { return _te.addr; }
+        virtual inline addr_t          get_maddr() const override           { return _te.addr / _te.size; }
+        virtual inline unsigned short  get_type() const override            { return _te.type; } // must be 0 for GATHER, 1 for SCATTER !!
+        virtual inline int64_t         get_max_access_size() const override { return MEMORY_ACCESS_SIZE;  }
+
+        virtual void output(std::ostream & os) const override {
+            os << "InstrAddrAdapterForPin: trace entry: type: [" << _te.type << "] size: [" << _te.size << "]";
+        }
+
+    private:
+        trace_entry_t _te;
+    };
+
+    class MemPatternsForPin : public MemPatterns<MEMORY_ACCESS_SIZE>
+    {
+    public:
+        MemPatternsForPin() : _metrics(GATHER, SCATTER),
+                              _iinfo(GATHER, SCATTER) { }
+        virtual ~MemPatternsForPin() override { }
+
+        void handle_trace_entry(const InstrAddrAdapter & ia) override;
+        void generate_patterns() override;
+
+        Metrics &     get_metrics(mem_access_type) override;
+        InstrInfo &   get_iinfo(mem_access_type) override;
+
+        Metrics &     get_gather_metrics() override        { return _metrics.first;  }
+        Metrics &     get_scatter_metrics() override       { return _metrics.second; }
+        InstrInfo &   get_gather_iinfo () override         { return _iinfo.first;    }
+        InstrInfo &   get_scatter_iinfo () override        { return _iinfo.second;   }
+        TraceInfo &   get_trace_info() override            { return _trace_info;     }
+        InstrWindow<MEMORY_ACCESS_SIZE> &
+                      get_instr_window() override          { return _iw;             }
+
+        void          set_log_level(int8_t level) override { _log_level = level;     }
+        int8_t        get_log_level() override             { return _log_level;      }
+
+        void set_trace_file(const std::string & trace_file_name) { _trace_file_name = trace_file_name; }
+        const std::string & get_trace_file_name()          { return _trace_file_name; }
+
+        void set_binary_file(const std::string & binary_file_name) { _binary_file_name = binary_file_name; }
+        const std::string & get_binary_file_name()         { return _binary_file_name; }
+
+        void update_metrics();
+
+        std::string get_file_prefix ();
+
+        void process_traces();
+        void update_source_lines();
+        double update_source_lines_from_binary(mem_access_type);
+        void process_second_pass(gzFile & fp_drtrace);
+
+    private:
+        std::pair<Metrics, Metrics>     _metrics;
+        std::pair<InstrInfo, InstrInfo> _iinfo;
+        TraceInfo                       _trace_info;
+        InstrWindow<MEMORY_ACCESS_SIZE> _iw;
+
+        int8_t                          _log_level         = 0;
+
+        std::string                     _trace_file_name;
+        std::string                     _binary_file_name;
+    };
+
+} // namespace gspin_patterns
+
+} // namespace gs_patterns
diff --git a/nvbit_tracing/README.md b/nvbit_tracing/README.md
new file mode 100644
index 0000000..b12303c
--- /dev/null
+++ b/nvbit_tracing/README.md
@@ -0,0 +1,117 @@
+# Setup
+Download NVBit from the following locations:
+
+https://github.com/NVlabs/NVBit
+
+#### Tested with version 1.7
+
+https://github.com/NVlabs/NVBit/releases/tag/1.7
+
+#### From the parent directory of the gs_patterns distribution
+
+```
+# For example for Linux x86_64)
+
+wget https://github.com/NVlabs/NVBit/releases/download/1.7/nvbit-Linux-aarch64-1.7.tar.bz2
+```
+
+
+```
+module load gcc #or make sure you have gcc. Tested with 8.5.0 and 11.4.0
+
+tar xvf <nvbit-$platform-1.7.tar.bz2>
+
+export NVBIT_DIR=</location/of/nvbit_release/>  # full path
+
+cp -rv gs_patterns/nvbit_tracing/gsnv_trace $NVBIT_DIR/tools/
+
+cd $NVBIT_DIR
+
+#Compile tools and test apps. Make sure the gsnv_trace tool compiled. If successful will produced $NVBIT_DIR/tools/gsnv_trace/gsnv_trace.so
+make -j
+```
+
+
+*** <b>NOTE</b> *** make sure you gzip the nvbit trace output file before attempting to use with gs_patterns.
+
+# gsnv_trace
+
+The gsnv_trace tool will instrument one or more CUDA kernels within a CUDA application and pass the resulting memory traces to the gs_patterns gs_patterns_core library.  
+Once the application has completed and all kernels are retired the gs_patterns_core library will begin processing the trace data and automatically generate the pattern outputs and pattern output files.  
+This includes the JSON file containing Gather/Scatter Patterns.
+
+###  configuration
+gsnv_trace tool can be configured by setting the GSNV_CONFIG_FILE environment variable to a config file.  
+The config file should have 1 configuration setting per line.  Configuration settings take the form "<CONFIG_ITEM> <CONFIG_VALUE>" where there is a space between the config item and its value.
+
+The following are a list of configuration items currently supported:
+
+| Config               | Description                                                                                                                                                                                                                | possible values                |
+|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------|
+| GSNV_LOG_LEVEL       | Sets the log level (only 0-2 are currently supported)                                                                                                                                                                      | 0 to 255                       |
+| GSNV_TARGET_KERNEL   | Specifies the names of Kernels which will be instrumented seperated by space, it none is provided all Kernels will be instrumented. If no exact match found, Will match all kernels which starts with the string provided. | A String                       |
+| GSNV_FILE_PREFIX     | Can be used if specify the prefix of output files e.g if prefix is "trace_file" then output files will be names trace_file.json, etc. If none is provided one will be inferred from the input trace file if provided.      | A String                       |
+| GSNV_TRACE_OUT_FILE  | Specifies the name of the output file which will be written with trace data. Trace file will not be written if this is not provided.                                                                                       | A String                       |
+| GSNV_MAX_TRACE_COUNT | Specifies the maximum number of memory traces which are processed, once this number of traces are seen instrumentation is disabled (Can be useful to produce a small trace file for testing)                               | An Integer e.g 1000000         |
+| GSNV_ONE_WARP_MODE   | Enable handling traces for a single warp (defaults to warp 0 if enabled). Analogous to trace of first thread in CPU mode.                                                                                                  | 1 (on) or 0 (off) the default) |
+
+
+
+Example:
+
+```
+echo "GSNV_LOG_LEVEL 1" > ./gsnv_config.txt
+echo "GSNV_TRACE_OUT_FILE trace_file.nvbit.bin" >> ./gsnv_config.txt
+echo "GSNV_TARGET_KERNEL SweepUCBxyzKernel" >> ./gsnv_config.txt
+echo "GSNV_FILE_PREFIX trace_file" >> ./gsnv_config.txt
+
+export GSNV_CONFIG_FILE=./gsnv_config.txt
+```
+
+Additional settings which are supported by NVBit can also be set via additional environment variables.  To see these please visit the NVBit documentation.
+Setting covered here are specific to the gsnv_trace tool.
+
+<b>NOTE</b>: It is highly recommended to specify a target kernel using GSNV_TARGET_KERNEL as this allows the tool to be used more efficiently also results in smaller trace files.
+
+### Instrumenting an application
+
+To start instrumenting a CUDA application using gsnv_trace.  The gsnv_trace.so libary previously built will need to be specified using LD_PRELOAD. 
+
+Example:
+
+```
+LD_PRELOAD=$NVBIT_DIR/tools/gsnv_trace/gsnv_trace.so <application> <application options>
+gzip trace_file.nvbit.bin
+```
+
+This will load gsnv_trace.so and then execute the specified application. NVBit will instrument the application using gsnv_trace.so which will call into libgs_patterns_core to write out the resulting trace file and generate memory patterns withn the trace.
+The gzip command will compress the resulting trace file for use by gs_patterns in a subsequent run.
+
+### Generating Memory Patterns using an existing trace file.
+
+In the previous section on Instrumenting an application, we used gsnv_trace.so to instrument an application, the resulting trace file was then compressed.
+The instrumentation run also generated pattern files. 
+If we want to rerun the pattern generation we can do so using the generated (and compressed) trace file without re-instrumenting the application as this is much faster.
+To do this we just need to run the gs_pattern binary with the trace file and the "-nv " option.  The "-nv" option indicates that the trace file is a NVBit trace.  
+
+Example:
+
+```
+export GS_PATTERNS_DIR=/path/to/gs_patterns/binary/
+$GS_PATTERNS_DIR/gs_patterns <trace_file.nvbit.bin.gz> -nv
+```
+
+### Important Notes 
+
+This version of gsnv_trace works with NVBit >= 1.7 
+
+Example:
+
+```
+export LD_LIBARY_PATH=/path/to/new/cuda/12.3/lib:$LD_LIBRARY_PATH
+export PATH=/path/to/new/cuda/12.3/bin:$PATH
+
+# Point to where you built gsnv_trace.so and invoke the application with its command line arguments
+LD_PRELOAD=$NVBIT_DIR/tools/gsnv_trace/gsnv_trace.so <application> <application arguments>
+gzip trace_file.nvbit.bin
+```
diff --git a/nvbit_tracing/gsnv_trace/Makefile b/nvbit_tracing/gsnv_trace/Makefile
new file mode 100644
index 0000000..1a6a9ec
--- /dev/null
+++ b/nvbit_tracing/gsnv_trace/Makefile
@@ -0,0 +1,51 @@
+NVCC=nvcc -ccbin=$(CXX) -D_FORCE_INLINES
+PTXAS=ptxas
+
+NVCC_VER_REQ=10.1
+NVCC_VER=$(shell $(NVCC) --version | grep release | cut -f2 -d, | cut -f3 -d' ')
+NVCC_VER_CHECK=$(shell echo "${NVCC_VER} >= $(NVCC_VER_REQ)" | bc)
+
+ifeq ($(NVCC_VER_CHECK),0)
+$(error ERROR: nvcc version >= $(NVCC_VER_REQ) required to compile an nvbit tool! Instrumented applications can still use lower versions of nvcc.)
+endif
+
+PTXAS_VER_ADD_FLAG=12.3
+PTXAS_VER=$(shell $(PTXAS) --version | grep release | cut -f2 -d, | cut -f3 -d' ')
+PTXAS_VER_CHECK=$(shell echo "${PTXAS_VER} >= $(PTXAS_VER_ADD_FLAG)" | bc)
+
+ifeq ($(PTXAS_VER_CHECK), 0)
+MAXRREGCOUNT_FLAG=-maxrregcount=24
+else
+MAXRREGCOUNT_FLAG=
+endif
+
+NVBIT_PATH=../../core
+GSPATTERNS_CORE_PATH=../../../gs_patterns
+INCLUDES=-I$(NVBIT_PATH) -I$(GSPATTERNS_CORE_PATH)
+
+LIBS=-L$(NVBIT_PATH) -lnvbit -L$(GSPATTERNS_CORE_PATH)/build -lgs_patterns_core
+NVCC_PATH=-L $(subst bin/nvcc,lib64,$(shell which nvcc | tr -s /))
+
+SOURCES=$(wildcard *.cu)
+
+OBJECTS=$(SOURCES:.cu=.o)
+ARCH?=all
+
+mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
+current_dir := $(notdir $(patsubst %/,%,$(dir $(mkfile_path))))
+
+NVBIT_TOOL=$(current_dir).so
+
+all: $(NVBIT_TOOL)
+
+$(NVBIT_TOOL): $(OBJECTS) $(NVBIT_PATH)/libnvbit.a
+	$(NVCC) -arch=$(ARCH) -O3 $(OBJECTS) $(LIBS) $(NVCC_PATH) -lcuda -lcudart_static -shared -o $@
+
+%.o: %.cu
+	$(NVCC) -dc -c -std=c++17 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -arch=$(ARCH) -O3 -Xcompiler -fPIC $< -o $@
+
+inject_funcs.o: inject_funcs.cu
+	$(NVCC) $(INCLUDES) $(MAXRREGCOUNT_FLAG) -Xptxas -astoolspatch --keep-device-functions -Xcompiler -Wall -arch=$(ARCH) -Xcompiler -fPIC -c $< -o $@
+
+clean:
+	rm -f *.so *.o
diff --git a/nvbit_tracing/gsnv_trace/common.h b/nvbit_tracing/gsnv_trace/common.h
new file mode 100644
index 0000000..cff31b7
--- /dev/null
+++ b/nvbit_tracing/gsnv_trace/common.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+/* information collected in the instrumentation function and passed
+ * on the channel from the GPU to the CPU */
+typedef struct {
+    uint64_t grid_launch_id;
+    int      cta_id_x;
+    int      cta_id_y;
+    int      cta_id_z;
+    int      warp_id;
+    int      opcode_id;
+    int      opcode_short_id;
+    int      is_load;
+    int      is_store;
+    int      size;
+    int      line_id;
+    uint64_t iaddr;
+    uint64_t addrs[32];
+} mem_access_t;
diff --git a/nvbit_tracing/gsnv_trace/gsnv_trace.cu b/nvbit_tracing/gsnv_trace/gsnv_trace.cu
new file mode 100644
index 0000000..40872d5
--- /dev/null
+++ b/nvbit_tracing/gsnv_trace/gsnv_trace.cu
@@ -0,0 +1,511 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <map>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include <unordered_map>
+
+/* every tool needs to include this once */
+#include "nvbit_tool.h"
+
+/* nvbit interface file */
+#include "nvbit.h"
+
+/* for channel */
+#include "utils/channel.hpp"
+
+/* contains definition of the mem_access_t structure */
+#include "common.h"
+
+#include <gs_patterns.h>
+#include <gs_patterns_core.h>
+#include <gsnv_patterns.h>
+
+using namespace gs_patterns;
+using namespace gs_patterns::gs_patterns_core;
+using namespace gs_patterns::gsnv_patterns;
+
+#define HEX(x)                                                            \
+    "0x" << std::setfill('0') << std::setw(16) << std::hex << (uint64_t)x \
+         << std::dec
+
+#define CHANNEL_SIZE (1l << 20)
+
+struct CTXstate {
+    /* context id */
+    int id;
+
+    /* Channel used to communicate from GPU to CPU receiving thread */
+    ChannelDev* channel_dev;
+    ChannelHost channel_host;
+
+    volatile bool recv_thread_done = false;
+};
+
+/* lock */
+pthread_mutex_t mutex;
+pthread_mutex_t cuda_event_mutex;
+
+/* map to store context state */
+std::unordered_map<CUcontext, CTXstate*> ctx_state_map;
+
+/* skip flag used to avoid re-entry on the nvbit_callback when issuing
+ * flush_channel kernel call */
+bool skip_callback_flag = false;
+
+/* global control variables for this tool */
+uint32_t instr_begin_interval = 0;
+uint32_t instr_end_interval = UINT32_MAX;
+int verbose = 0;
+std::string gsnv_config_file;
+
+/* opcode to id map and reverse map  */
+std::map<std::string, int> opcode_to_id_map;
+std::map<int, std::string> id_to_opcode_map;
+std::map<std::string, int> opcode_short_to_id_map;
+std::map<std::string, int> line_to_id_map;
+
+// Instantiate GSPatterns for NVBit
+std::unique_ptr<MemPatternsForNV> mp(new MemPatternsForNV);
+
+
+/* grid launch id, incremented at every launch */
+uint64_t grid_launch_id = 0;
+
+void* recv_thread_fun(void* args);
+
+void nvbit_at_init() {
+    setenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC", "1", 1);
+    GET_VAR_INT(
+        instr_begin_interval, "INSTR_BEGIN", 0,
+        "Beginning of the instruction interval where to apply instrumentation");
+    GET_VAR_INT(
+        instr_end_interval, "INSTR_END", UINT32_MAX,
+        "End of the instruction interval where to apply instrumentation");
+    GET_VAR_INT(verbose, "TOOL_VERBOSE", 0, "Enable verbosity inside the tool");
+
+    GET_VAR_STR(gsnv_config_file, "GSNV_CONFIG_FILE", "Specify a GSNV config file");
+
+    std::string pad(100, '-');
+    printf("%s\n", pad.c_str());
+
+    /* set mutex as recursive */
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+    pthread_mutex_init(&mutex, &attr);
+
+    pthread_mutex_init(&cuda_event_mutex, &attr);
+}
+
+/* Set used to avoid re-instrumenting the same functions multiple times */
+std::unordered_set<CUfunction> already_instrumented;
+
+void instrument_function_if_needed(CUcontext ctx, CUfunction func) {
+    assert(ctx_state_map.find(ctx) != ctx_state_map.end());
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    /* Get related functions of the kernel (device function that can be
+     * called by the kernel) */
+    std::vector<CUfunction> related_functions =
+        nvbit_get_related_functions(ctx, func);
+
+    /* add kernel itself to the related function vector */
+    related_functions.push_back(func);
+
+    /* iterate on function */
+    for (auto f : related_functions) {
+        /* "recording" function was instrumented, if set insertion failed
+         * we have already encountered this function */
+        if (!already_instrumented.insert(f).second) {
+            continue;
+        }
+
+        /* get vector of instructions of function "f" */
+        const std::vector<Instr*>& instrs = nvbit_get_instrs(ctx, f);
+
+        if (verbose) {
+            printf(
+                "GSNV_TRACE: CTX %p, Inspecting CUfunction %p name %s at address "
+                "0x%lx\n",
+                ctx, f, nvbit_get_func_name(ctx, f), nvbit_get_func_addr(f));
+        }
+
+        // Get address of function PC
+        uint64_t func_addr = nvbit_get_func_addr(f);
+
+        uint32_t cnt = 0;
+        /* iterate on all the static instructions in the function */
+        for (auto instr : instrs) {
+            if (cnt < instr_begin_interval || cnt >= instr_end_interval ||
+                instr->getMemorySpace() == InstrType::MemorySpace::NONE ||
+                instr->getMemorySpace() == InstrType::MemorySpace::CONSTANT) {
+                cnt++;
+                continue;
+            }
+            if (verbose) {
+                instr->printDecoded();
+            }
+
+            // Opcode to OpCodeID
+            if (opcode_to_id_map.find(instr->getOpcode()) == opcode_to_id_map.end()) {
+                int opcode_id = opcode_to_id_map.size();
+                opcode_to_id_map[instr->getOpcode()] = opcode_id;
+                id_to_opcode_map[opcode_id] = std::string(instr->getOpcode());
+            }
+
+            int opcode_id = opcode_to_id_map[instr->getOpcode()];
+
+            // Opcode_Short to OpCode_Short_ID
+            if (opcode_short_to_id_map.find(instr->getOpcodeShort()) == opcode_short_to_id_map.end()) {
+                int opcode_short_id = opcode_short_to_id_map.size();
+                opcode_short_to_id_map[instr->getOpcodeShort()] = opcode_short_id;
+                //id_to_opcode_map[opcode_id] = std::string(instr->getOpcode());
+            }
+            int opcode_short_id = opcode_short_to_id_map[instr->getOpcodeShort()];
+
+            // Line to Line_ID
+            /* Get line information for a particular instruction offset if available, */
+            /* binary must be compiled with --generate-line-info   (-lineinfo) */
+            char * line_str;
+            char * dir_str;
+            uint32_t line_num;
+            bool status = nvbit_get_line_info(ctx, func, instr->getOffset(), &line_str, &dir_str, &line_num);
+
+            std::string line;
+            int line_id = -1;
+            if (status) {
+                std::stringstream ss;
+                ss << dir_str << line_str << ":" << line_num;
+                line = ss.str();
+
+                if (line_to_id_map.find(line) == line_to_id_map.end()) {
+                    line_id = line_to_id_map.size();
+                    line_to_id_map[line] = line_id;
+                }
+                line_id = line_to_id_map[line];
+                //std::cout << "Creating a mapping from: " << line << " to line_id: " << line_id << std::endl;
+            }
+
+            // Let MemPatternsForNV know about the mapping
+            mp->add_or_update_opcode(opcode_id, instr->getOpcode());
+            mp->add_or_update_opcode_short(opcode_short_id, instr->getOpcodeShort());
+            if (status) { mp->add_or_update_line(line_id, line); }
+
+            // Compute instruction address (function address + instruction offset)
+            uint64_t instr_addr = func_addr + instr->getOffset();
+
+            int mref_idx = 0;
+            /* iterate on the operands */
+            for (int i = 0; i < instr->getNumOperands(); i++) {
+                /* get the operand "i" */
+                const InstrType::operand_t* op = instr->getOperand(i);
+
+                if (op->type == InstrType::OperandType::MREF) {
+                    /* insert call to the instrumentation function with its
+                     * arguments */
+                    nvbit_insert_call(instr, "instrument_mem", IPOINT_BEFORE);
+                    /* predicate value */
+                    nvbit_add_call_arg_guard_pred_val(instr);
+                    /* opcode id */
+                    nvbit_add_call_arg_const_val32(instr, opcode_id);
+
+                    /* opcode short id */
+                    nvbit_add_call_arg_const_val32(instr, opcode_short_id);
+                    /* isLoad */
+                    nvbit_add_call_arg_const_val32(instr, instr->isLoad());
+                    /* isStore */
+                    nvbit_add_call_arg_const_val32(instr, instr->isStore());
+                    /* size */
+                    nvbit_add_call_arg_const_val32(instr, instr->getSize());
+                    /* line number id */
+                    nvbit_add_call_arg_const_val32(instr, line_id);
+
+                    /* Memory instruction address */
+                    nvbit_add_call_arg_const_val64(instr, instr_addr);
+
+                    /* memory reference 64 bit address */
+                    nvbit_add_call_arg_mref_addr64(instr, mref_idx);
+                    /* add "space" for kernel function pointer that will be set
+                     * at launch time (64 bit value at offset 0 of the dynamic
+                     * arguments)*/
+                    nvbit_add_call_arg_launch_val64(instr, 0);
+                    /* add pointer to channel_dev*/
+                    nvbit_add_call_arg_const_val64(
+                        instr, (uint64_t)ctx_state->channel_dev);
+                    mref_idx++;
+                }
+            }
+            cnt++;
+        }
+    }
+}
+
+/* flush channel */
+__global__ void flush_channel(ChannelDev* ch_dev) { ch_dev->flush(); }
+
+void init_context_state(CUcontext ctx) {
+    CTXstate* ctx_state = ctx_state_map[ctx];
+    ctx_state->recv_thread_done = false;
+    cudaMallocManaged(&ctx_state->channel_dev, sizeof(ChannelDev));
+    ctx_state->channel_host.init((int)ctx_state_map.size() - 1, CHANNEL_SIZE,
+                                 ctx_state->channel_dev, recv_thread_fun, ctx);
+    nvbit_set_tool_pthread(ctx_state->channel_host.get_thread());
+}
+
+void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,
+                         const char* name, void* params, CUresult* pStatus) {
+    pthread_mutex_lock(&cuda_event_mutex);
+
+    /* we prevent re-entry on this callback when issuing CUDA functions inside
+     * this function */
+    if (skip_callback_flag) {
+        pthread_mutex_unlock(&cuda_event_mutex);
+        return;
+    }
+    skip_callback_flag = true;
+
+    if (cbid == API_CUDA_cuLaunchKernel_ptsz ||
+        cbid == API_CUDA_cuLaunchKernel ||
+        cbid == API_CUDA_cuLaunchCooperativeKernel ||
+        cbid == API_CUDA_cuLaunchCooperativeKernel_ptsz ||
+        cbid == API_CUDA_cuLaunchKernelEx ||
+        cbid == API_CUDA_cuLaunchKernelEx_ptsz) {
+        CTXstate* ctx_state = ctx_state_map[ctx];
+
+        CUfunction func;
+        if (cbid == API_CUDA_cuLaunchKernelEx_ptsz ||
+            cbid == API_CUDA_cuLaunchKernelEx) {
+            cuLaunchKernelEx_params* p = (cuLaunchKernelEx_params*)params;
+            func = p->f;
+        } else {
+            cuLaunchKernel_params* p = (cuLaunchKernel_params*)params;
+            func = p->f;
+        }
+
+        if (!is_exit && mp->should_instrument(nvbit_get_func_name(ctx, func)))
+        {
+            /* Make sure GPU is idle */
+            cudaDeviceSynchronize();
+            assert(cudaGetLastError() == cudaSuccess);
+
+            /* instrument */
+            instrument_function_if_needed(ctx, func);
+
+            int nregs = 0;
+            CUDA_SAFECALL(
+                cuFuncGetAttribute(&nregs, CU_FUNC_ATTRIBUTE_NUM_REGS, func));
+
+            int shmem_static_nbytes = 0;
+            CUDA_SAFECALL(
+                cuFuncGetAttribute(&shmem_static_nbytes,
+                                   CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, func));
+
+            /* get function name and pc */
+            const char* func_name = nvbit_get_func_name(ctx, func);
+            uint64_t pc = nvbit_get_func_addr(func);
+
+            /* set grid launch id at launch time */
+            nvbit_set_at_launch(ctx, func, (uint64_t)&grid_launch_id);
+
+            /* enable instrumented code to run */
+            nvbit_enable_instrumented(ctx, func, true);
+
+            if (cbid == API_CUDA_cuLaunchKernelEx_ptsz ||
+                cbid == API_CUDA_cuLaunchKernelEx)
+            {
+                cuLaunchKernelEx_params *p = (cuLaunchKernelEx_params *) params;
+                printf(
+                    "GSNV_TRACE: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - "
+                    "Kernel name %s - grid launch id %ld - grid size %d,%d,%d "
+                    "- block size %d,%d,%d - nregs %d - shmem %d - cuda stream "
+                    "id %ld\n",
+                    (uint64_t)ctx, pc, func_name, grid_launch_id,
+                    p->config->gridDimX, p->config->gridDimY,
+                    p->config->gridDimZ, p->config->blockDimX,
+                    p->config->blockDimY, p->config->blockDimZ, nregs,
+                    shmem_static_nbytes + p->config->sharedMemBytes,
+                    (uint64_t)p->config->hStream);
+            }
+            else
+            {
+                cuLaunchKernel_params* p = (cuLaunchKernel_params*)params;
+                printf(
+                    "GSNV_TRACE: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - "
+                    "Kernel name %s - grid launch id %ld - grid size %d,%d,%d "
+                    "- block size %d,%d,%d - nregs %d - shmem %d - cuda stream "
+                    "id %ld\n",
+                    (uint64_t)ctx, pc, func_name, grid_launch_id, p->gridDimX,
+                    p->gridDimY, p->gridDimZ, p->blockDimX, p->blockDimY,
+                    p->blockDimZ, nregs,
+                    shmem_static_nbytes + p->sharedMemBytes,
+                    (uint64_t)p->hStream);
+            }
+
+        }
+        else
+        {
+            // make sure user kernel finishes to avoid deadlock
+            cudaDeviceSynchronize();
+            /* push a flush channel kernel */
+            flush_channel<<<1, 1>>>(ctx_state->channel_dev);
+
+            /* Make sure GPU is idle */
+            cudaDeviceSynchronize();
+            assert(cudaGetLastError() == cudaSuccess);
+
+            /* increment grid launch id for next launch */
+            grid_launch_id++;
+        }
+    }
+    skip_callback_flag = false;
+    pthread_mutex_unlock(&cuda_event_mutex);
+}
+
+void* recv_thread_fun(void* args) {
+    CUcontext ctx = (CUcontext)args;
+
+    pthread_mutex_lock(&mutex);
+    /* get context state from map */
+    assert(ctx_state_map.find(ctx) != ctx_state_map.end());
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    ChannelHost* ch_host = &ctx_state->channel_host;
+    pthread_mutex_unlock(&mutex);
+    char* recv_buffer = (char*)malloc(CHANNEL_SIZE);
+
+    while (!ctx_state->recv_thread_done) {
+        /* receive buffer from channel */
+        uint32_t num_recv_bytes = ch_host->recv(recv_buffer, CHANNEL_SIZE);
+        if (num_recv_bytes > 0) {
+            uint32_t num_processed_bytes = 0;
+            while (num_processed_bytes < num_recv_bytes) {
+                mem_access_t* ma =
+                    (mem_access_t*)&recv_buffer[num_processed_bytes];
+
+#if 0
+                std::stringstream ss;
+                ss << "CTX " << HEX(ctx) << " - grid_launch_id "
+                   << ma->grid_launch_id << " - CTA " << ma->cta_id_x << ","
+                   << ma->cta_id_y << "," << ma->cta_id_z << " - warp "
+                   << ma->warp_id << " - " << id_to_opcode_map[ma->opcode_id]
+                   << " - iaddr " << HEX(ma->iaddr) << " - ";
+
+                for (int i = 0; i < 32; i++) {
+                    ss << HEX(ma->addrs[i]) << " ";
+                }
+
+                printf("GSNV_TRACE: %s\n", ss.str().c_str());
+#endif
+                num_processed_bytes += sizeof(mem_access_t);
+
+                try
+                {
+                    // Handle trace update here
+                    mp->handle_cta_memory_access(ma);
+                }
+                catch (const std::exception & ex)
+                {
+                    std::cerr << "ERROR: " << ex.what() << std::endl;
+                }
+            }
+        }
+    }
+    ctx_state->recv_thread_done = false;
+    free(recv_buffer);
+    return NULL;
+}
+
+void nvbit_at_ctx_init(CUcontext ctx) {
+    pthread_mutex_lock(&mutex);
+    //if (verbose) {
+    if (1) {
+        printf("GSNV_TRACE: STARTING CONTEXT %p\n", ctx);
+    }
+    assert(ctx_state_map.find(ctx) == ctx_state_map.end());
+    CTXstate* ctx_state = new CTXstate;
+    ctx_state_map[ctx] = ctx_state;
+    pthread_mutex_unlock(&mutex);
+
+    // -- init #2 - whats the difference
+    try {
+        if (!gsnv_config_file.empty()) {
+            mp->set_config_file(gsnv_config_file);
+        }
+    }
+    catch (const std::exception & ex) {
+        std::cerr << "ERROR: " << ex.what() << std::endl;
+    }
+}
+
+void nvbit_tool_init(CUcontext ctx) {
+    pthread_mutex_lock(&mutex);
+    assert(ctx_state_map.find(ctx) != ctx_state_map.end());
+    init_context_state(ctx);
+    pthread_mutex_unlock(&mutex);
+}
+
+void nvbit_at_ctx_term(CUcontext ctx) {
+    pthread_mutex_lock(&mutex);
+    skip_callback_flag = true;
+    //if (verbose) {
+    if (1) {
+        printf("GSNV_TRACE: TERMINATING CONTEXT %p\n", ctx);
+    }
+    /* get context state from map */
+    assert(ctx_state_map.find(ctx) != ctx_state_map.end());
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    /* Notify receiver thread and wait for receiver thread to
+     * notify back */
+    ctx_state->recv_thread_done = true;
+    while (!ctx_state->recv_thread_done)
+        ;
+
+    ctx_state->channel_host.destroy(false);
+    cudaFree(ctx_state->channel_dev);
+    skip_callback_flag = false;
+    delete ctx_state;
+    pthread_mutex_unlock(&mutex);
+
+    try
+    {
+        // Generate GS Pattern output fle
+        mp->generate_patterns();
+    }
+    catch (const std::exception & ex)
+    {
+        std::cerr << "ERROR: " << ex.what() << std::endl;
+    }
+}
diff --git a/nvbit_tracing/gsnv_trace/inject_funcs.cu b/nvbit_tracing/gsnv_trace/inject_funcs.cu
new file mode 100644
index 0000000..8998ce3
--- /dev/null
+++ b/nvbit_tracing/gsnv_trace/inject_funcs.cu
@@ -0,0 +1,85 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "utils/utils.h"
+
+/* for channel */
+#include "utils/channel.hpp"
+
+/* contains definition of the mem_access_t structure */
+#include "common.h"
+
+extern "C" __device__ __noinline__ void instrument_mem(int pred,
+                                                       int opcode_id,
+                                                       int opcode_short_id,
+                                                       int is_load,
+                                                       int is_store,
+                                                       int size,
+                                                       int line_id,
+                                                       uint64_t iaddr,
+                                                       uint64_t addr,
+                                                       uint64_t grid_launch_id,
+                                                       uint64_t pchannel_dev) {
+    /* if thread is predicated off, return */
+    if (!pred) {
+        return;
+    }
+
+    int active_mask = __ballot_sync(__activemask(), 1);
+    const int laneid = get_laneid();
+    const int first_laneid = __ffs(active_mask) - 1;
+
+    mem_access_t ma;
+
+    /* collect memory address information from other threads */
+    for (int i = 0; i < 32; i++) {
+        ma.addrs[i] = __shfl_sync(active_mask, addr, i);
+    }
+
+    int4 cta = get_ctaid();
+    ma.grid_launch_id = grid_launch_id;
+    ma.cta_id_x = cta.x;
+    ma.cta_id_y = cta.y;
+    ma.cta_id_z = cta.z;
+    ma.warp_id = get_warpid();
+    ma.opcode_id = opcode_id;
+    ma.opcode_short_id = opcode_short_id;
+    ma.is_load = is_load;
+    ma.is_store = is_store;
+    ma.size = size;
+    ma.line_id = line_id;
+    ma.iaddr = iaddr;
+
+    /* first active lane pushes information on the channel */
+    if (first_laneid == laneid) {
+        ChannelDev* channel_dev = (ChannelDev*)pchannel_dev;
+        channel_dev->push(&ma, sizeof(mem_access_t));
+    }
+}
diff --git a/utils.cpp b/utils.cpp
new file mode 100644
index 0000000..adacb47
--- /dev/null
+++ b/utils.cpp
@@ -0,0 +1,126 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdexcept>
+
+#include "utils.h"
+
+namespace gs_patterns
+{
+namespace gs_patterns_core
+{
+
+static inline int popcount(uint64_t x) {
+    int c;
+
+    for (c = 0; x != 0; x >>= 1)
+        if (x & 1)
+            c++;
+    return c;
+}
+
+//string tools
+int startswith(const char* a, const char* b) {
+    if (strncmp(b, a, strlen(b)) == 0)
+        return 1;
+    return 0;
+}
+
+int endswith(const char* a, const char* b) {
+    int idx = strlen(a);
+    int preidx = strlen(b);
+
+    if (preidx >= idx)
+        return 0;
+    if (strncmp(b, &a[idx - preidx], preidx) == 0)
+        return 1;
+    return 0;
+}
+
+//https://stackoverflow.com/questions/779875/what-function-is-to-replace-a-substring-from-a-string-in-c
+const char* str_replace(const char* orig, const char* rep, const char* with) {
+    char* result; // the return string
+    char* ins;    // the next insert point
+    char* tmp;    // varies
+    int len_rep;  // length of rep (the string to remove)
+    int len_with; // length of with (the string to replace rep with)
+    int len_front; // distance between rep and end of last rep
+    int count;    // number of replacements
+
+    // sanity checks and initialization
+    if (!orig)
+        return NULL;
+
+    if (!rep)
+        return orig;
+
+    len_rep = strlen(rep);
+    if (len_rep == 0)
+        return NULL; // empty rep causes infinite loop during count
+    if (!with)
+        with = "";
+    len_with = strlen(with);
+
+    // count the number of replacements needed
+    ins = (char*)orig;
+    for (count = 0; (tmp = strstr(ins, rep)); ++count) {
+        ins = tmp + len_rep;
+    }
+
+    tmp = result = (char*)malloc(strlen(orig) + (len_with - len_rep) * count + 1);
+
+    if (!result)
+        return NULL;
+
+    while (count--) {
+        ins = (char*)strstr(orig, rep);
+        len_front = ins - orig;
+        tmp = strncpy(tmp, orig, len_front) + len_front;
+        tmp = strcpy(tmp, with) + len_with;
+        orig += len_front + len_rep; // move to next "end of rep"
+    }
+    strcpy(tmp, orig);
+    return result;
+}
+
+char* get_str(char* line, char* bparse, char* aparse) {
+
+    char* sline;
+
+    sline = (char*)str_replace(line, bparse, "");
+    sline = (char*)str_replace(sline, aparse, "");
+
+    return sline;
+}
+
+int cnt_str(char* line, char c) {
+
+    int cnt = 0;
+    for (int i = 0; line[i] != '\0'; i++) {
+        if (line[i] == c)
+            cnt++;
+    }
+
+    return cnt;
+}
+
+gzFile open_trace_file(const std::string & trace_file_name)
+{
+    gzFile fp;
+
+    fp = gzopen(trace_file_name.c_str(), "hrb");
+    if (NULL == fp) {
+        throw std::runtime_error("Could not open " + trace_file_name + "!");
+    }
+    return fp;
+}
+
+void close_trace_file (gzFile & fp)
+{
+    gzclose(fp);
+}
+
+} // gs_patterns_core
+
+} // gs_patterns
diff --git a/utils.h b/utils.h
new file mode 100644
index 0000000..644bfbf
--- /dev/null
+++ b/utils.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <zlib.h>
+
+namespace gs_patterns
+{
+namespace gs_patterns_core
+{
+
+int startswith(const char* a, const char* b);
+
+int endswith(const char* a, const char* b);
+
+const char* str_replace(const char* orig, const char* rep, const char* with);
+
+char* get_str(char* line, char* bparse, char* aparse);
+
+int cnt_str(char* line, char c);
+
+gzFile open_trace_file(const std::string & trace_file_name);
+
+void close_trace_file (gzFile & fp);
+
+} // namespace gs_gs_patterns_core
+
+} // namespace gs_patterns
\ No newline at end of file