Using realloc to expand buffer when reading from crash files

I am writing code that should read fasta files, so part of my code (see below) is a fasta parser. Since a single sequence can span multiple lines in fasta format, I need to combine several consecutive lines read from a file into one line. I do this by rearranging the string buffer after reading each line to be the current length of the sequence, as well as the length of the line in which it is being read. I do some other things like removing white space, etc. Everything goes well for the first sequence, but fasta files can contain multiple sequences. Similarly, I have a dynamic array of structures with two lines (name and actual sequence), being "char *". Again, when I come across a new name (an input line starting with '>'),I am increasing the number of sequences and redistributing the sequence list buffer. Realloc segfaults when allocating space for a second sequence with

*** glibc detected *** ./stackoverflow: malloc(): memory corruption: 0x09fd9210 ***
Aborted

In life, I do not understand why. I ran it through gdb and everything seems to work (i.e., everything is initialized, the values ​​seem reasonable) ... Here is the code:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <math.h>
#include <errno.h>

//a struture to keep a record of sequences read in from file, and their titles
typedef struct {
    char *title;
    char *sequence;
} sequence_rec;

//string convenience functions

//checks whether a string consists entirely of white space
int empty(const char *s) {
    int i;
    i = 0;
    while (s[i] != 0) {
        if (!isspace(s[i])) return 0;
        i++;
    }
    return 1;
}

//substr allocates and returns a new string which is a substring of s from i to
//j exclusive, where i < j; If i or j are negative they refer to distance from
//the end of the s
char *substr(const char *s, int i, int j) {
    char *ret;
    if (i < 0) i = strlen(s)-i;
    if (j < 0) j = strlen(s)-j;
    ret = malloc(j-i+1);
    strncpy(ret,s,j-i);
    return ret;
}

//strips white space from either end of the string
void strip(char **s) {
    int i, j, len;
    char *tmp = *s;
    len = strlen(*s);
    i = 0;
    while ((isspace(*(*s+i)))&&(i < len)) {
        i++;
    }
    j = strlen(*s)-1;
    while ((isspace(*(*s+j)))&&(j > 0)) {
        j--;
    }
    *s = strndup(*s+i, j-i);
    free(tmp);
}


int main(int argc, char**argv) {
    sequence_rec *sequences = NULL;
    FILE *f = NULL;
    char *line = NULL;
    size_t linelen;
    int rcount;
    int numsequences = 0;

    f = fopen(argv[1], "r");
    if (f == NULL) {
        fprintf(stderr, "Error opening %s: %s\n", argv[1], strerror(errno));
        return EXIT_FAILURE;
    }
    rcount = getline(&line, &linelen, f);
    while (rcount != -1) {
        while (empty(line)) rcount = getline(&line, &linelen, f);
        if (line[0] != '>') {
            fprintf(stderr,"Sequence input not in valid fasta format\n");
            return EXIT_FAILURE;
        }

        numsequences++;
        sequences = realloc(sequences,sizeof(sequence_rec)*numsequences);
        sequences[numsequences-1].title = strdup(line+1); strip(&sequences[numsequences-1].title);
        rcount = getline(&line, &linelen, f);
        sequences[numsequences-1].sequence = malloc(1); sequences[numsequences-1].sequence[0] = 0;
        while ((!empty(line))&&(line[0] != '>')) {
            strip(&line);
            sequences[numsequences-1].sequence = realloc(sequences[numsequences-1].sequence, strlen(sequences[numsequences-1].sequence)+strlen(line)+1);
            strcat(sequences[numsequences-1].sequence,line);
            rcount = getline(&line, &linelen, f);
        }
    }
    return EXIT_SUCCESS;
}
+5
source share
3 answers

I think the memory corruption issue may be the result of how you process the data used in your calls getline(). Basically, it is lineredistributed through strndup()on calls strip(), so the buffer size tracked in linelenon getline()will no longer be accurate. getline()may intercept the buffer.

while ((!empty(line))&&(line[0] != '>')) {

    strip(&line);    // <-- assigns a `strndup()` allocation to `line`

    sequences[numsequences-1].sequence = realloc(sequences[numsequences-1].sequence, strlen(sequences[numsequences-1].sequence)+strlen(line)+1);
    strcat(sequences[numsequences-1].sequence,line);

    rcount = getline(&line, &linelen, f);   // <-- the buffer `line` points to might be
                                            //      smaller than `linelen` bytes

}
+1
source

You should use lines that look something like this:

struct string {
    int len;
    char *ptr;
};

This prevents strncpy errors, as you seem to have seen, and makes strcat and friends run faster.

. memcpys. - :

int sstrcat(struct string *a, struct string *b)
{
    int len = a->len + b->len;
    int alen = a->len;
    if (a->len < len) {
        while (a->len < len) {
            a->len *= 2;
        }
        a->ptr = realloc(a->ptr, a->len);
        if (a->ptr == NULL) {
            return ENOMEM;
        }
    }
    memcpy(&a->ptr[alen], b->ptr, b->len);
    return 0;
}

, , , , , , . :

struct string {
    int len;
    char ptr[0];
};

, , malloc(sizeof(struct string) + len) malloc. , , .

, , , . Valgrind , gdb .

+4

:

strncpy(ret,s,j-i);
return ret;

ret . . man strncpy:

       char *strncpy(char *dest, const char *src, size_t n);

       ...

       The strncpy() function is similar, except that at most n bytes  of  src
       are  copied.  Warning: If there is no null byte among the first n bytes
       of src, the string placed in dest will not be null terminated.

:

j = strlen(*s)-1;
while ((isspace(*(*s+j)))&&(j > 0)) {

, strlen(*s) 0? (*s)[-1].

strip(), . , j < i.

edit: Just noticed that your function substr()is not actually called.

+3
source

All Articles