How, when and what to vectorize in python?

That's right, so this is basically a continuation of my previous question. I have some binary data that are in binary floating point format. Using C, the process is fast , but I'm losing some precision with atof (). I tried to browse the forum, as well as elsewhere, but my problem was not resolved. So I switched to python. Ah, joy! the program works fine, but is so slow compared to C. I was looking for python optimizations that pointed to Cython and Weave, but I have some doubts. If you follow my code, I am confused where to apply the optimizing C code since I am reading from a numpy object. My question is: is it possible to read data using numpy functions in Cython, and if so, provide a small example.

C code uses PolSARpro header files, and libbmp to create a .bmp file

As a note, I am sending both of my codes. God knows that I had to go through a lot to get the formulas to work. That way, others who need it can also give their thoughts and input :)

C Code (it works, but atof () loses accuracy, so the output of lat long is a bit off)


#include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h> #include <polSARpro/bmpfile.c> #include <polSARpro/graphics.c> #include <polSARpro/matrix.c> #include <polSARpro/processing.c> #include <polSARpro/util.c> #define METAL_THRESHOLD 5.000000 #define POLARIZATION_FRACTION_THRESHOLD 0.900000 #define PI 3.14159265 #define FOURTHPI PI/4 #define deg2rad PI/180 #define rad2deg 180./PI /*double PI = 3.14159265; double FOURTHPI = PI / 4; double deg2rad = PI / 180; double rad2deg = 180.0 / PI;*/ FILE *L1,*PF,*SPF; FILE *txt; FILE *finalLocations; long i=0,loop_end; int lig,col; float l1,pf,spf; long pos; int Nlig,Ncol; float *bufferout; float *bufferin_L1,*bufferin_L2; float valueL1,valuePF,xx; float sizeGridX, sizeGridY, startX, startY; float posX,posY; int ZONE; char Heading[10]; char setZone[15]; int p[4][2]; int degree, minute, second; void UTM2LL(int ReferenceEllipsoid, double UTMNorthing, double UTMEasting, char* UTMZone, double *Lat, double *Long) { //converts UTM coords to lat/long. Equations from USGS Bulletin 1532 //East Longitudes are positive, West longitudes are negative. //North latitudes are positive, South latitudes are negative //Lat and Long are in decimal degrees. //Written by Chuck Gantz- chuck.gantz@globalstar.com double k0 = 0.9996; double a = 6378137; double eccSquared = 0.00669438; double eccPrimeSquared; double e1 = (1-sqrt(1-eccSquared))/(1+sqrt(1-eccSquared)); double N1, T1, C1, R1, D, M; double LongOrigin; double mu, phi1, phi1Rad; double x, y; int ZoneNumber; char* ZoneLetter; int NorthernHemisphere; //1 for northern hemispher, 0 for southern x = UTMEasting - 500000.0; //remove 500,000 meter offset for longitude y = UTMNorthing; ZoneNumber = strtoul(UTMZone, &ZoneLetter, 10); if((*ZoneLetter - 'N') >= 0) NorthernHemisphere = 1;//point is in northern hemisphere else { NorthernHemisphere = 0;//point is in southern hemisphere y -= 10000000.0;//remove 10,000,000 meter offset used for southern hemisphere } LongOrigin = (ZoneNumber - 1)*6 - 180 + 3; //+3 puts origin in middle of zone eccPrimeSquared = (eccSquared)/(1-eccSquared); M = y / k0; mu = M/(a*(1-eccSquared/4-3*eccSquared*eccSquared/64-5*eccSquared*eccSquared*eccSquared/256)); phi1Rad = mu + (3*e1/2-27*e1*e1*e1/32)*sin(2*mu) + (21*e1*e1/16-55*e1*e1*e1*e1/32)*sin(4*mu) +(151*e1*e1*e1/96)*sin(6*mu); phi1 = phi1Rad*rad2deg; N1 = a/sqrt(1-eccSquared*sin(phi1Rad)*sin(phi1Rad)); T1 = tan(phi1Rad)*tan(phi1Rad); C1 = eccPrimeSquared*cos(phi1Rad)*cos(phi1Rad); R1 = a*(1-eccSquared)/pow(1-eccSquared*sin(phi1Rad)*sin(phi1Rad), 1.5); D = x/(N1*k0); *Lat = phi1Rad - (N1*tan(phi1Rad)/R1)*(D*D/2-(5+3*T1+10*C1-4*C1*C1-9*eccPrimeSquared)*D*D*D*D/24 +(61+90*T1+298*C1+45*T1*T1-252*eccPrimeSquared-3*C1*C1)*D*D*D*D*D*D/720); *Lat = *Lat * rad2deg; *Long = (D-(1+2*T1+C1)*D*D*D/6+(5-2*C1+28*T1-3*C1*C1+8*eccPrimeSquared+24*T1*T1) *D*D*D*D*D/120)/cos(phi1Rad); *Long = LongOrigin + *Long * rad2deg; } void convertToDegree(float decimal) { int negative = decimal < 0; decimal = abs(decimal); minute = (decimal * 3600/ 60); second = fmodf((decimal * 3600),60); degree = minute / 60; minute = minute % 60; if (negative) { if (degree > 0) degree = -degree; else if (minute > 0) minute = -minute; else second = -second; } } void readConfig(int *Row, int *Col) { char tmp[70]; int i=0; FILE *fp = fopen("config.txt","r"); if(fp == NULL) { perror("Config.txt"); exit(1); } while(!feof(fp)) { fgets(tmp,70,fp); if (i==1) *Row = atoi(tmp); if(i==4) *Col = atoi(tmp); i++; } fclose(fp); } void readHDR(float *gridX,float *gridY,float *startXPos,float *startYPos) { FILE *fp = fopen("PF.bin.hdr","r"); int i=0; char tmp[255]; char junk[255]; memset(tmp,0X00,sizeof(tmp)); memset(junk,0X00,sizeof(junk)); if(fp==NULL) { perror("Please locate or create PF.bin.hdr"); exit(0); } while(!feof(fp)) { if(i==13) break; fgets(tmp,255,fp); i++; } fclose(fp); strcpy(junk,strtok(tmp,",")); strtok(NULL,","); strtok(NULL,","); strcpy(tmp,strtok(NULL,",")); //puts(tmp); *startXPos = atof(tmp); strcpy(tmp,strtok(NULL,",")); //puts(tmp); *startYPos = atof(tmp); strcpy(tmp,strtok(NULL,",")); //puts(tmp); *gridX = atof(tmp); strcpy(tmp,strtok(NULL,",")); //puts(tmp); *gridY = atof(tmp); strcpy(tmp,strtok(NULL,",")); ZONE = atoi(tmp); strcpy(tmp,strtok(NULL,",")); strcpy(Heading,tmp); } int main() { bmpfile_t *bmp; double Lat; double Long; int i; rgb_pixel_t pixelMetal = {128, 64, 0, 0}; rgb_pixel_t pixelOthers = {128, 64, 0, 0}; readConfig(&Nlig,&Ncol); readHDR(&sizeGridX,&sizeGridY,&startX,&startY); //startX = startX - (double) 0.012000; //startY = startY + (double)0.111000; printf("Enter the rectangle top-left and bottom-right region of interest points as: xy\n"); for(i=0;i<2;i++) { printf("Enter point %d::\t",i+1); scanf("%d %d",&p[i][0], &p[i][1]); } printf("Grid Size(X,Y)::( %f,%f ), Start Positions(X,Y)::( %f, %f ), ZONE::%d, Heading:: %s\n\n",sizeGridX,sizeGridY,startX,startY,ZONE,Heading); pixelMetal.red = 255; pixelMetal.blue = 010; pixelMetal.green = 010; pixelOthers.red = 8; pixelOthers.blue = 8; pixelOthers.green = 8; L1 = fopen("l1.bin","rb"); PF =fopen("PF.bin","rb"); SPF = fopen("SPF_L1.bin","wb"); //txt = fopen("locations(UTM).txt","w"); finalLocations = fopen("locationsROI.txt","w"); if(L1==NULL || PF==NULL || SPF==NULL || finalLocations == NULL) { perror("Error in opening files!"); return -1; } fseek(L1,0,SEEK_END); pos = ftell(L1); loop_end = pos; printf("L1.bin contains::\t%ld elements\n",pos); fseek(PF,0,SEEK_END); pos = ftell(PF); printf("PF.bin contains::\t%ld elements\n",pos); fseek(L1,0,SEEK_SET); fseek(PF,0,SEEK_SET); bmp = bmp_create(Ncol,Nlig,8); //width * height bufferin_L1 = vector_float(Ncol); bufferin_L2 = vector_float(Ncol); bufferout = vector_float(Ncol); printf("Resources Allocated. Beginning...\n"); for (lig = 0; lig < Nlig; lig++) /* rows */ { if (lig%(int)(Nlig/20) == 0) { printf("%f\r", 100. * lig / (Nlig - 1)); fflush(stdout); } fread(&bufferin_L1[0], sizeof(float), Ncol, L1); fread(&bufferin_L2[0], sizeof(float), Ncol, PF); for (col = 0; col < Ncol; col++) /* columns */ { valueL1 = bufferin_L1[col]; valuePF = bufferin_L2[col]; if(valueL1 >= METAL_THRESHOLD && valuePF >= POLARIZATION_FRACTION_THRESHOLD) { if(col >= p[0][0] && col <= p[1][0] && lig >= p[0][1] && lig <= p[1][1]) { xx = fabs(valueL1 + valuePF); bmp_set_pixel(bmp,col,lig,pixelMetal); posX = startX + (sizeGridX * col); posY = startY - (sizeGridY * lig); //fprintf(txt,"%f %f %d %s\n",posX,posY,ZONE,Heading); sprintf(setZone,"%d",ZONE); if(strstr(Heading,"Nor")!=NULL) strcat(setZone,"N"); else strcat(setZone,"S"); UTM2LL(23, posY, posX, setZone, &Lat, &Long); // 23 for WGS-84 convertToDegree(Lat); //fprintf(finalLocations,"UTM:: %.2fE %.2fN , Decimal: %f %f , Degree: %d %d %d, ",posX,posY,Lat,Long,degree,minute,second); //fprintf(finalLocations,"%.2fE,%.2fN,%f,%f ,%d,%d,%d,",posX,posY,Lat,Long,degree,minute,second); fprintf(finalLocations,"%.2f,%.2f,%f,%f ,%d,%d,%d,",posX,posY,Lat,Long,degree,minute,second); convertToDegree(Long); fprintf(finalLocations,"%d,%d,%d\n",degree,minute,second); } else { xx = fabs(valueL1) ; bmp_set_pixel(bmp,col,lig,pixelOthers); } } else { xx = fabs(valueL1) ; bmp_set_pixel(bmp,col,lig,pixelOthers); } bufferout[col] = xx; } fwrite(&bufferout[0], sizeof(float), Ncol, SPF); } free_vector_float(bufferout); fclose(L1); fclose(PF); fclose(SPF); //fclose(txt); fclose(finalLocations); printf("\n----------Writing BMP File!----------\n"); bmp_save(bmp,"SPF_L1(ROI).bmp"); bmp_destroy(bmp); printf("\nDone!\n"); } 

Just like the Python code ::


 # -*- coding: utf-8 -*- """ Created on Wed Apr 10 10:29:18 2013 @author: Binayaka """ import numpy as Num; import math; import array; class readConfiguration(object): def __init__(self,x): self.readConfig(x); def readConfig(self,x): try: crs = open(x,'r'); srs = open('config.txt','r'); except IOError: print "Files missing!"; else: rows = crs.readlines(); values = rows[12].split(','); rows = srs.readlines(); self.startX = float(values[3]); self.startY = float(values[4]); self.gridSizeX = float(values[5]); self.gridSizeY = float(values[6]); self.Zone = int(values[7]); self.Hemisphere = values[8]; self.NRows = int(rows[1].strip()); self.NCols = int(rows[4].strip()); self.MetalThreshold = 5.000000; self.PFThreshold = 0.900000; self.rad2deg = 180/math.pi; self.deg2rad = math.pi/180; self.FOURTHPI = math.pi/4; crs.close(); srs.close(); def decdeg2dms(dd): negative = dd < 0; dd = abs(dd); minutes,seconds = divmod(dd*3600,60); degrees,minutes = divmod(minutes,60); if negative: if degrees > 0: degrees = -degrees; elif minutes > 0: minutes = -minutes; else: seconds = -seconds; return (degrees,minutes,seconds); def UTM2LL(self,UTMEasting, UTMNorthing): k0 = 0.9996; a = 6378137; eccSquared = 0.00669438; e1 = (1-math.sqrt(1-eccSquared))/(1+math.sqrt(1-eccSquared)); x = UTMEasting - 500000.0;#remove 500,000 meter offset for longitude y = UTMNorthing; if self.Hemisphere == "North": self.Hemi = 1; else: self.Hemi = -1; y -= 10000000.0; LongOrigin = (self.Zone - 1)*6 - 180 + 3; eccPrimeSquared = (eccSquared)/(1-eccSquared); M = y / k0; mu = M/(a*(1-eccSquared/4-3*eccSquared*eccSquared/64-5*eccSquared*eccSquared*eccSquared/256)); phi1Rad = mu + (3*e1/2-27*e1*e1*e1/32)*math.sin(2*mu) + (21*e1*e1/16-55*e1*e1*e1*e1/32)*math.sin(4*mu) +(151*e1*e1*e1/96)*math.sin(6*mu); #phi1 = phi1Rad*self.rad2deg; N1 = a/math.sqrt(1-eccSquared*math.sin(phi1Rad)*math.sin(phi1Rad)); T1 = math.tan(phi1Rad)*math.tan(phi1Rad); C1 = eccPrimeSquared*math.cos(phi1Rad)*math.cos(phi1Rad); R1 = a*(1-eccSquared)/pow(1-eccSquared*math.sin(phi1Rad)*math.sin(phi1Rad), 1.5); D = x/(N1*k0); self.Lat = phi1Rad - (N1*math.tan(phi1Rad)/R1)*(D*D/2-(5+3*T1+10*C1-4*C1*C1-9*eccPrimeSquared)*D*D*D*D/24 +(61+90*T1+298*C1+45*T1*T1-252*eccPrimeSquared-3*C1*C1)*D*D*D*D*D*D/720); self.Lat = self.Lat * self.rad2deg; self.Long = (D-(1+2*T1+C1)*D*D*D/6+(5-2*C1+28*T1-3*C1*C1+8*eccPrimeSquared+24*T1*T1)*D*D*D*D*D/120)/math.cos(phi1Rad); self.Long = LongOrigin + self.Long * self.rad2deg; def printConfiguration(self): """ Just to check whether our reading was correct """ print "Metal Threshold:\t" + str(self.MetalThreshold); print "PF Threshold:\t" + str(self.PFThreshold); print "Start X:\t" + str(self.startX); print "Start Y:\t" + str(self.startY); print "Grid size(X) :\t" + str(self.gridSizeX); print "Grid size(Y) :\t" + str(self.gridSizeY); def createROIfile(self,ROIFilename): firstPoint = raw_input('Enter topLeft point coord\t').split(); secondPoint = raw_input('Enter bottomRight point coord\t').split(); try: L1 = open('l1.bin','rb'); PF = open('PF.bin','rb'); SPF = open('pySPF_L1.bin','wb'); targetFilename = open(ROIFilename,'w'); except IOError: print "Files Missing!"; else: L1.seek(0,2); elementsL1 = L1.tell(); L1.seek(0,0); PF.seek(0,2); elementsPF = PF.tell(); PF.seek(0,0); print "L1.bin contains\t" + str(elementsL1) + " elements"; print "PF.bin contains\t" + str(elementsPF) + " elements"; binvaluesL1 = array.array('f'); binvaluesPF = array.array('f'); binvaluesSPF = array.array('f'); for row in range(0,self.NRows): binvaluesL1.read(L1,self.NCols); binvaluesPF.read(PF,self.NCols); dataL1 = Num.array(binvaluesL1, dtype=Num.float); dataPF = Num.array(binvaluesPF, dtype=Num.float); dataSPF = dataL1 + dataPF; binvaluesSPF.fromlist(Num.array(dataSPF).tolist()); for col in range(0,self.NCols): if(dataL1[col] >= self.MetalThreshold and dataPF[col] >= self.PFThreshold): if(col >= int(firstPoint[0]) and col <= int(secondPoint[0]) and row >= int(firstPoint[1]) and row <= int(secondPoint[1])): posX = self.startX + (self.gridSizeX * col); posY = self.startY - (self.gridSizeY * row); self.UTM2LL(posY,posX); tmp1 = self.decdeg2dms(posY); tmp2 = self.decdeg2dms(posX); strTarget = "Decimal Degree:: " + str(posX) + "E " + str(posY) + "N \t Lat long:: " + str(tmp1) + " " + str(tmp2) + "\n"; targetFilename.write(strTarget); binvaluesSPF.tofile(SPF); L1.close(); PF.close(); SPF.close(); targetFilename.close(); print "Done!"; dimensions = readConfiguration('PF.bin.hdr'); dimensions.printConfiguration(); dimensions.createROIfile('testPythonROI.txt'); 

Its Python code, which needs to be optimized, since the values ​​of NRows and NCols can reach the order of thousands.

+4
source share
1 answer

A few general notes:

  • With python, it is best to stick with PEP8 for a variety of reasons. Python programmers are particularly picky about readability and, essentially, universally adhere to community coding guidelines (PEP8). Avoid camelCase, keep lines below 80 columns, leave semicolons, and feel free to sometimes ignore these guidelines where they will make things less readable.

  • There is no need for a built-in array type if you are using numpy. I am confused why you are constantly converting back and forth ...

  • Use the projection library. Indicate which date and ellipsoid you are using, otherwise the coordinates (east / north or lat / long) have absolutely no value.

  • Do not use one large class as a hold for unrelated things. There is nothing wrong with having only a few features. You do not need to do this in the classroom, if it makes sense to do it.

  • Use vectorized operations with numpy arrays.

Here's what your performance bottleneck might seem like:

  for row in range(0,self.NRows): binvaluesL1.read(L1,self.NCols); binvaluesPF.read(PF,self.NCols); dataL1 = Num.array(binvaluesL1, dtype=Num.float); dataPF = Num.array(binvaluesPF, dtype=Num.float); dataSPF = dataL1 + dataPF; binvaluesSPF.fromlist(Num.array(dataSPF).tolist()); for col in range(0,self.NCols): if(dataL1[col] >= self.MetalThreshold and dataPF[col] >= self.PFThreshold): if(col >= int(firstPoint[0]) and col <= int(secondPoint[0]) and row >= int(firstPoint[1]) and row <= int(secondPoint[1])): posX = self.startX + (self.gridSizeX * col); posY = self.startY - (self.gridSizeY * row); self.UTM2LL(posY,posX); tmp1 = self.decdeg2dms(posY); tmp2 = self.decdeg2dms(posX); strTarget = "Decimal Degree:: " + str(posX) + "E " + str(posY) + "N \t Lat long:: " + str(tmp1) + " " + str(tmp2) + "\n"; targetFilename.write(strTarget); binvaluesSPF.tofile(SPF); 

One of the biggest problems is how you read the data. You constantly read things as one, and then convert it to a list, and then convert it to a numpy array. There is absolutely no need to jump over all these hoops. Numpy will unpack your binary floats, as if array would be.

Just do grid = np.fromfile(yourfile, dtype=np.float32).reshape(ncols, nrows) . (Out of cycle.)

After that, your nested loops can be easily vectorized and expressed with just a few lines of code.

This is how I write your code. It probably won’t work as it is, since I cannot check it with your data. However, he should give you some general ideas.

 import numpy as np import pyproj def main(): config = Config('PF.bin.hdr') grid1, grid2 = load_data('l1.bin', 'PF.bin', config.nrows, config.ncols) spf = grid1 + grid2 spf.tofile('pySPF_L1.bin') easting_aoi, northing_aoi = subset_data(grid1, grid2, config) save_selected_region(easting_aoi, northing_aoi, config.zone, 'testPythonROI.txt') def load_data(filename1, filename2, nrows, ncols): """It would really be good to use more descriptive variable names than "L1" and "PF". I have no idea what L1 and PF are, so I'm just calling them grid1 and grid2.""" grid1 = np.fromfile(filename1, dtype=np.float32).reshape(nrows, ncols) grid2 = np.fromfile(filename2, dtype=np.float32).reshape(nrows, ncols) return grid1, grid2 def subset_data(grid1, grid2, config): """Select points that satisfy some threshold criteria (explain??) and are within a user-specified rectangular AOI.""" northing, easting = np.mgrid[:config.nrows, :config.ncols] easting = config.xstart + config.xgridsize * easting northing = config.ystart + config.ygridsize * northing grids = grid1, grid2, easting, northing grid1, grid2, easting, northing = [item[config.user_aoi] for item in grids] mask = (grid1 >= config.metal_threshold) & (grid2 >= config.pf_threshold) return easting[mask], northing[mask] def save_selected_region(easting, northing, zone, filename): """Convert the given eastings and northings (in UTM zone "zone") to lat/long and save to a tab-delimited-text file.""" lat, lon = utm2geographic(easting, northing, zone) data = np.vstack([easting, northing, lat, lon]).T with open(filename, 'w') as outfile: outfile.write('Easting\tNorthing\tLatitude\tLongitude\n') np.savetxt(outfile, data, delimiter='\t') def utm2geographic(easting, northing, zone): """We need to know which datum/ellipsoid the UTM coords are in as well!!!! I'm assuming it a Clark 1866 ellipsoid, based on the numbers in your code...""" utm = pyproj.Proj(proj='utm', zone=zone, ellip='clrk66') geographic = pyproj.Proj(proj='latlong', ellip='clrk66') return pyproj.transform(utm, geographic, easting, northing) class Config(object): """Read and store configuration values for (something?).""" config_file = 'config.txt' def __init__(self, filename): """You should add docstrings to clarify what you're expecting "filename" to contain.""" with open(filename, 'r') as infile: crs_values = list(infile)[12].split(',') crs_values = [float(item) for item in crs_values] self.xstart, self.ystart = crs_values[3:5] self.xgridsize, self.ygridsize = crs_values[5:7] self.zone = int(crs_values[7]) with open(self.config_file, 'r') as infile: srs_values = list(infile) self.nrows, self.ncols = srs_values[1], srs_values[4] # It would be good to explain a bit about these (say, units, etc) self.metal_threshold = 5.0 self.pf_threshold = 0.9 self.user_aoi = self.read_user_aoi() def read_user_aoi(self): """Get an area of interest of the grids in pixel coordinates.""" top_left = raw_input('Enter top left index\t') bottom_right = raw_input('Enter bottom right index\t') min_i, min_j = [int(item) for item in top_left.split()] max_i, max_j = [int(item) for item in bottom_right.split()] return slice(min_i, max_i), slice(min_j, max_j) if __name__ == '__main__': main() 
+3
source

All Articles