(I edited this for clarity and slightly modified the actual question based on the EOL answer) I'm trying to translate the following function in C to Python, but fail (see C code below). As I understand it, it takes four 1-byte characters, starting from the memory location pointed to by, treats them as unsigned long ints to give each of 4 bytes of space, and does some bit offsets to order them as big-endian 32 bit integer. It is then used in a file validation algorithm. (from the Treaty of Babylon )
static int32 read_alan_int(unsigned char *from) { return ((unsigned long int) from[3])| ((unsigned long int)from[2] << 8) | ((unsigned long int) from[1]<<16)| ((unsigned long int)from[0] << 24); } static int32 claim_story_file(void *story_file, int32 extent) { unsigned char *sf = (unsigned char *) story_file; int32 bf, i, crc=0; if (extent < 160) return INVALID_STORY_FILE_RV; if (memcmp(sf,"ALAN",4)) { bf=read_alan_int(sf+4); if (bf > extent/4) return INVALID_STORY_FILE_RV; for (i=24;i<81;i+=4) if (read_alan_int(sf+i) > extent/4) return INVALID_STORY_FILE_RV; for (i=160;i<(bf*4);i++) crc+=sf[i]; if (crc!=read_alan_int(sf+152)) return INVALID_STORY_FILE_RV; return VALID_STORY_FILE_RV; } else { bf=read_alan_int(sf+12); if (bf > (extent/4)) return INVALID_STORY_FILE_RV; for (i=184;i<(bf*4);i++) crc+=sf[i]; if (crc!=read_alan_int(sf+176)) return INVALID_STORY_FILE_RV; } return INVALID_STORY_FILE_RV; }
I am trying to override this in Python. To implement the read_alan_int function read_alan_int I would think that importing struct and executing struct.unpack_from('>L', data, offset) would work. However, in valid files, this always returns 24 for the value of bf , which means the for loop is skipped.
def read_alan_int(file_buffer, i): i0 = ord(file_buffer[i]) * (2 ** 24) i1 = ord(file_buffer[i + 1]) * (2 ** 16) i2 = ord(file_buffer[i + 2]) * (2 ** 8) i3 = ord(file_buffer[i + 3]) return i0 + i1 + i2 + i3 def is_a(file_buffer): crc = 0 if len(file_buffer) < 160: return False if file_buffer[0:4] == 'ALAN': # Identify Alan 2.x bf = read_alan_int(file_buffer, 4) if bf > len(file_buffer)/4: return False for i in range(24, 81, 4): if read_alan_int(file_buffer, i) > len(file_buffer)/4: return False for i in range(160, bf * 4): crc += ord(file_buffer[i]) if crc != read_alan_int(file_buffer, 152): return False return True else: # Identify Alan 3.x #bf = read_long(file_buffer, 12, '>') bf = read_alan_int(file_buffer, 12) print bf if bf > len(file_buffer)/4: return False for i in range(184, bf * 4): crc += ord(file_buffer[i]) if crc != read_alan_int(file_buffer, 176): return False return True return False if __name__ == '__main__': import sys, struct data = open(sys.argv[1], 'rb').read() print is_a(data)
... but damn still returns 24. Unfortunately, my C skills don't exist, so it's hard for me to get the source program to print some debugging output so that I can know what bf should be.
What am I doing wrong?
Ok, so I obviously read read_alan_int correctly. However, that for me is not a check that the first 4 characters are "ALAN". All my test files fail this test. I changed the code to remove this if / else statement and instead just take advantage of the early return, and now all my unit tests pass. So, on a practical level, I am done. However, I will keep the question open to solve a new problem: how can I possibly interrupt a bit to get "ALAN" from the first four characters?
def is_a(file_buffer): crc = 0 if len(file_buffer) < 160: return False #if file_buffer.startswith('ALAN'): # Identify Alan 2.x bf = read_long(file_buffer, 4) if bf > len(file_buffer)/4: return False for i in range(24, 81, 4): if read_long(file_buffer, i) > len(file_buffer)/4: return False for i in range(160, bf * 4): crc += ord(file_buffer[i]) if crc == read_long(file_buffer, 152): return True # Identify Alan 3.x crc = 0 bf = read_long(file_buffer, 12) if bf > len(file_buffer)/4: return False for i in range(184, bf * 4): crc += ord(file_buffer[i]) if crc == read_long(file_buffer, 176): return True return False