title: redis5.0_01_SDS.字符串
tags: longzy:2018-12-2
在上一篇文章中,通过编译redis,设置断点执行,然后跟踪了redis的大概执行流程,那么从今天以后,将对redis源码的每个细节进行阅读分析。今天阅读分析的是redis字符串SDS。
在C语言中,字符串一般有两种表示方法
- char *buf1 = "redis_5.0";
- char buf2[] = "redis_5.0";
而在redis中,自己封装了一种叫简单动态字符串(simple dynamic string,SDS)的类型来表示的,同时也兼容了c语言的字符串。
SDS定义
定义是很简单的,我简单加了些注释和自己的看法。
/* An sds is a plain char pointer. Per the original annotation it points at the
 * buf[] member of one of the sdshdrXX headers below, so the header sits
 * immediately before the address the caller holds. */
typedef char *sds;
/* Smallest header: a single flags byte that carries both the type and the
 * string length. It has no 'alloc' field. */
struct __attribute__ ((__packed__)) sdshdr5 {
unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
char buf[];
};
/* sdsnewlen() switches an empty string from type 5 to this header, and
 * sdsMakeRoomFor() never selects type 5, so sdshdr8 is the smallest header
 * used for strings that are expected to grow. */
struct __attribute__ ((__packed__)) sdshdr8 {
/* Number of bytes currently in use, not counting the trailing NUL.
 * Keeping it in the header means sdslen() can report the length in O(1). */
uint8_t len; /* used */
/* Capacity of buf, not counting the header or the trailing NUL. */
uint8_t alloc; /* excluding the header and null terminator */
/* Header type tag: SDS_TYPE_8, SDS_TYPE_16, ... */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
/* The string bytes themselves. */
char buf[];
};
/* Same layout as sdshdr8 with 16-bit len/alloc fields. */
struct __attribute__ ((__packed__)) sdshdr16 {
uint16_t len; /* used */
uint16_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
/* Same layout as sdshdr8 with 32-bit len/alloc fields. */
struct __attribute__ ((__packed__)) sdshdr32 {
uint32_t len; /* used */
uint32_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
/* Same layout as sdshdr8 with 64-bit len/alloc fields. */
struct __attribute__ ((__packed__)) sdshdr64 {
uint64_t len; /* used */
uint64_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
/*
__attribute__ ((__packed__)): 告诉编译器取消结构体在编译过程中的对齐填充,成员按实际字节大小紧凑排列,这是GCC的扩展语法
在GCC下:struct my{ char ch; int a;}; sizeof(int)=4;sizeof(struct my)=8;(非紧凑模式)
在GCC下:struct my{ char ch; int a;} __attribute__ ((__packed__)); sizeof(int)=4;sizeof(struct my)=5;(紧凑模式)
*/
/* Header type tags; the value is stored in the low bits of the flags byte. */
#define SDS_TYPE_5 0
#define SDS_TYPE_8 1
#define SDS_TYPE_16 2
#define SDS_TYPE_32 3
#define SDS_TYPE_64 4
/* Mask selecting the type tag out of the flags byte (low 3 bits). */
#define SDS_TYPE_MASK 7
/* Number of bits occupied by the type tag. */
#define SDS_TYPE_BITS 3
/* Declare a local 'sh' pointing at the sdshdr<T> header that precedes sds 's'.
 * The expansion already ends in a semicolon. */
#define SDS_HDR_VAR(T,s) struct sdshdr##T *sh = (void*)((s)-(sizeof(struct sdshdr##T)));
/* Expression form of the above: pointer to the sdshdr<T> header of 's'. */
#define SDS_HDR(T,s) ((struct sdshdr##T *)((s)-(sizeof(struct sdshdr##T))))
/* Extract the length stored in the upper bits of a type-5 flags byte. */
#define SDS_TYPE_5_LEN(f) ((f)>>SDS_TYPE_BITS)
/* '##' is the preprocessor token-pasting operator: sdshdr##T with T=8
 * becomes sdshdr8. */
这里很是羞愧,之前居然没遇到过 `__attribute__ ((__packed__))`
下面分析几个核心的函数
sds sdsnewlen(const void *init, size_t initlen);
sds sdscatlen(sds s, const void *t, size_t len);
sds sdscatvprintf(sds s, const char *fmt, va_list ap);
sds sdscatfmt(sds s, char const *fmt, ...);
sds sdstrim(sds s, const char *cset);
void sdsrange(sds s, ssize_t start, ssize_t end);
sds *sdssplitlen(const char *s, ssize_t len, const char *sep, int seplen, int *count);
sds *sdssplitargs(const char *line, int *argc);
sds sdsMakeRoomFor(sds s, size_t addlen);
这几个函数从字面意思看都很明显:新建、连接、格式化、去掉指定字符、截取区间、分割、扩展空间、移除空闲空间等等。另外这些函数都是在内部分配内存,所以调用者需要在外面(用sdsfree)释放内存。
我们依次看看他们的实现
sds sdsnewlen(const void *init, size_t initlen)
这个函数没啥难度
/* Create a new sds string with the content specified by the 'init' pointer
 * and 'initlen'.
 *
 * Parameters:
 *   init    - bytes to copy into the new string, or NULL to get a string of
 *             'initlen' zero bytes.
 *   initlen - number of bytes in the new string, terminator excluded.
 *
 * Returns the new sds, or NULL if the allocation fails or if
 * header + initlen + 1 does not fit in a size_t.
 *
 * The string is always null-terminated (all the sds strings are, always) so
 * even if you create an sds string with:
 *
 * mystring = sdsnewlen("abc",3);
 *
 * You can print the string with printf() as there is an implicit \0 at the
 * end of the string. However the string is binary safe and can contain
 * \0 characters in the middle, as the length is stored in the sds header. */
sds sdsnewlen(const void *init, size_t initlen) {
    void *sh;
    sds s;
    /* Pick the header type from the requested length. */
    char type = sdsReqType(initlen);
    int hdrlen;
    size_t total;
    unsigned char *fp; /* flags pointer. */

    /* Empty strings are usually created in order to append. Use type 8
     * since type 5 is not good at this. */
    if (type == SDS_TYPE_5 && initlen == 0) type = SDS_TYPE_8;
    hdrlen = sdsHdrSize(type);

    /* Refuse lengths for which header + payload + terminator would wrap
     * around: the allocation would be too small for the memcpy below. */
    if (initlen > (size_t)-1 - (size_t)hdrlen - 1) return NULL;
    total = (size_t)hdrlen + initlen + 1;

    sh = s_malloc(total);
    if (sh == NULL) return NULL;
    if (!init)
        memset(sh, 0, total);
    s = (char*)sh+hdrlen;
    /* The flags byte is always the last header byte, right before buf. */
    fp = ((unsigned char*)s)-1;
    switch(type) {
        case SDS_TYPE_5: {
            *fp = type | (initlen << SDS_TYPE_BITS);
            break;
        }
        case SDS_TYPE_8: {
            SDS_HDR_VAR(8,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        case SDS_TYPE_16: {
            SDS_HDR_VAR(16,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        case SDS_TYPE_32: {
            SDS_HDR_VAR(32,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        case SDS_TYPE_64: {
            SDS_HDR_VAR(64,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
    }
    if (initlen && init)
        memcpy(s, init, initlen);
    s[initlen] = '\0';
    return s;
}
sds sdsMakeRoomFor(sds s, size_t addlen);
/* Enlarge the free space at the end of the sds string so that the caller
 * is sure that after calling this function can overwrite up to addlen
 * bytes after the end of the string, plus one more byte for nul term.
 *
 * Parameters:
 *   s      - the string to grow.
 *   addlen - number of extra bytes the caller wants to be able to append.
 *
 * Returns the (possibly moved) sds, or NULL if the allocation fails or the
 * requested size does not fit in a size_t. On NULL the original string has
 * not been freed by this function.
 *
 * Note: this does not change the *length* of the sds string as returned
 * by sdslen(), but only the free buffer space we have. */
sds sdsMakeRoomFor(sds s, size_t addlen) {
    void *sh, *newsh;
    size_t avail = sdsavail(s); /* spare bytes already available */
    size_t len, reqlen, newlen;
    char type, oldtype = s[-1] & SDS_TYPE_MASK;
    int hdrlen;

    /* Return ASAP if there is enough space left. */
    if (avail >= addlen) return s;

    len = sdslen(s);
    sh = (char*)s-sdsHdrSize(oldtype);
    reqlen = len+addlen;
    if (reqlen < len) return NULL; /* len+addlen wrapped around */

    /* Over-allocate to cut down on future reallocations: double the size
     * while it is below SDS_MAX_PREALLOC (1024*1024 = 1 MB according to the
     * original annotation), otherwise add SDS_MAX_PREALLOC extra bytes. */
    newlen = reqlen;
    if (newlen < SDS_MAX_PREALLOC)
        newlen *= 2;
    else
        newlen += SDS_MAX_PREALLOC;
    if (newlen < reqlen) return NULL; /* preallocation wrapped around */

    /* The capacity changed, so the header type may have to change too. */
    type = sdsReqType(newlen);

    /* Don't use type 5: the user is appending to the string and type 5 is
     * not able to remember empty space, so sdsMakeRoomFor() must be called
     * at every appending operation. */
    if (type == SDS_TYPE_5) type = SDS_TYPE_8;

    hdrlen = sdsHdrSize(type);
    /* header + payload + terminator must not wrap around either. */
    if (newlen > (size_t)-1 - (size_t)hdrlen - 1) return NULL;

    if (oldtype==type) {
        /* Same header layout: the block can be grown in place. */
        newsh = s_realloc(sh, hdrlen+newlen+1);
        if (newsh == NULL) return NULL;
        s = (char*)newsh+hdrlen;
    } else {
        /* Since the header size changes, need to move the string forward,
         * and can't use realloc */
        newsh = s_malloc(hdrlen+newlen+1);
        if (newsh == NULL) return NULL;
        memcpy((char*)newsh+hdrlen, s, len+1);
        /* 'sh' is the start of the old allocation that 's' pointed into. */
        s_free(sh);
        s = (char*)newsh+hdrlen;
        s[-1] = type;
        sdssetlen(s, len);
    }
    sdssetalloc(s, newlen);
    return s;
}
sds sdscatlen(sds s, const void *t, size_t len)
/* Append 'len' bytes starting at 't' to the end of the sds string 's'.
 * The source is treated as raw bytes, so it may contain \0 characters.
 *
 * Parameters:
 *   s   - destination string.
 *   t   - source bytes.
 *   len - number of bytes to append.
 *
 * Returns the resulting sds, or NULL if sdsMakeRoomFor() fails.
 *
 * After the call, the passed sds string is no longer valid and all the
 * references must be substituted with the new pointer returned by the call. */
sds sdscatlen(sds s, const void *t, size_t len) {
    const size_t oldlen = sdslen(s);
    sds grown = sdsMakeRoomFor(s, len); /* reserve room for the new bytes */

    if (grown == NULL) {
        return NULL;
    }
    memcpy(grown + oldlen, t, len);
    grown[oldlen + len] = '\0';
    sdssetlen(grown, oldlen + len);
    return grown;
}
sds sdscatvprintf(sds s, const char *fmt, va_list ap);
/* Like sdscatprintf() but gets va_list instead of being variadic.
 *
 * Formats 'fmt' with the arguments in 'ap' and appends the result to 's'.
 *
 * Returns the resulting sds (via sdscat()), or NULL if a temporary buffer
 * cannot be allocated or vsnprintf() reports an error. */
sds sdscatvprintf(sds s, const char *fmt, va_list ap) {
    va_list cpy;
    char staticbuf[1024], *buf = staticbuf, *t;
    size_t buflen = strlen(fmt)*2;
    int written;

    /* We try to start using a static buffer for speed.
     * If not possible we revert to heap allocation. */
    if (buflen > sizeof(staticbuf)) {
        buf = s_malloc(buflen);
        if (buf == NULL) return NULL;
    } else {
        buflen = sizeof(staticbuf);
    }

    /* Format into the current buffer. vsnprintf() returns the length the
     * full output needs, so a single retry with exactly that size is
     * enough when the first attempt was truncated. */
    while(1) {
        va_copy(cpy,ap);
        written = vsnprintf(buf, buflen, fmt, cpy);
        va_end(cpy);
        if (written < 0) {
            /* Output error: the buffer contents cannot be trusted. */
            if (buf != staticbuf) s_free(buf);
            return NULL;
        }
        if ((size_t)written >= buflen) {
            if (buf != staticbuf) s_free(buf);
            buflen = (size_t)written+1;
            buf = s_malloc(buflen);
            if (buf == NULL) return NULL;
            continue;
        }
        break;
    }

    /* Finally concat the obtained string to the SDS string and return it. */
    t = sdscat(s, buf);
    if (buf != staticbuf) s_free(buf);
    return t;
}
sds sdscatfmt(sds s, char const *fmt, ...);
/* This function is similar to sdscatprintf, but much faster as it does
 * not rely on sprintf() family functions implemented by the libc that
 * are often very slow. Moreover directly handling the sds string as
 * new data is concatenated provides a performance improvement.
 *
 * However this function only handles an incompatible subset of printf-alike
 * format specifiers:
 *
 * %s - C String
 * %S - SDS string
 * %i - signed int
 * %I - 64 bit signed integer (long long, int64_t)
 * %u - unsigned int
 * %U - 64 bit unsigned integer (unsigned long long, uint64_t)
 * %% - Verbatim "%" character.
 *
 * Any other character after '%' is copied verbatim. A lone '%' at the very
 * end of 'fmt' is ignored.
 *
 * Returns the resulting sds, or NULL if sdsMakeRoomFor() fails while
 * growing the string.
 */
sds sdscatfmt(sds s, char const *fmt, ...) {
    const char *f = fmt;  /* Next format specifier byte to process. */
    size_t i = sdslen(s); /* Position of the next byte to write to dest str. */
    va_list ap;

    va_start(ap,fmt);
    while(*f) {
        char next, *str;
        size_t l;
        long long num;
        unsigned long long unum;

        /* Make sure there is always space for at least 1 char. */
        if (sdsavail(s)==0) {
            s = sdsMakeRoomFor(s,1);
            if (s == NULL) goto fmt_error;
        }

        switch(*f) {
        case '%':
            next = *(f+1);
            /* A trailing '%' has no specifier: stop here so that the
             * f++ below lands on the terminator instead of past it. */
            if (next == '\0') break;
            f++;
            switch(next) {
            case 's':
            case 'S':
                str = va_arg(ap,char*);
                /* %s is a C string, %S is an sds with a stored length. */
                l = (next == 's') ? strlen(str) : sdslen(str);
                if (sdsavail(s) < l) {
                    s = sdsMakeRoomFor(s,l);
                    if (s == NULL) goto fmt_error;
                }
                memcpy(s+i,str,l);
                sdsinclen(s,l);
                i += l;
                break;
            case 'i':
            case 'I':
                if (next == 'i')
                    num = va_arg(ap,int);
                else
                    num = va_arg(ap,long long);
                {
                    char buf[SDS_LLSTR_SIZE];
                    /* Convert to decimal text, then append like %s. */
                    l = sdsll2str(buf,num);
                    if (sdsavail(s) < l) {
                        s = sdsMakeRoomFor(s,l);
                        if (s == NULL) goto fmt_error;
                    }
                    memcpy(s+i,buf,l);
                    sdsinclen(s,l);
                    i += l;
                }
                break;
            case 'u':
            case 'U':
                if (next == 'u')
                    unum = va_arg(ap,unsigned int);
                else
                    unum = va_arg(ap,unsigned long long);
                {
                    char buf[SDS_LLSTR_SIZE];
                    /* Convert to decimal text, then append like %s. */
                    l = sdsull2str(buf,unum);
                    if (sdsavail(s) < l) {
                        s = sdsMakeRoomFor(s,l);
                        if (s == NULL) goto fmt_error;
                    }
                    memcpy(s+i,buf,l);
                    sdsinclen(s,l);
                    i += l;
                }
                break;
            default: /* Handle %% and generally %<unknown>. */
                s[i++] = next;
                sdsinclen(s,1);
                break;
            }
            break;
        default:
            s[i++] = *f;
            sdsinclen(s,1);
            break;
        }
        f++;
    }
    va_end(ap);

    /* Add null-term */
    s[i] = '\0';
    return s;

fmt_error:
    va_end(ap);
    return NULL;
}
sds sdstrim(sds s, const char *cset);
/* Strip from both ends of 's' every leading and trailing character that
 * appears in 'cset', a null terminated C string. Characters in the middle
 * of the string are left alone.
 *
 * The string is edited in place and the same pointer is returned.
 *
 * Example:
 *
 * s = sdsnew("AA...AA.a.aa.aHelloWorld :::");
 * s = sdstrim(s,"Aa. :");
 * printf("%s\n", s);
 *
 * Output will be just "HelloWorld".
 */
sds sdstrim(sds s, const char *cset) {
    char *last = s+sdslen(s)-1;
    char *left = s;
    char *right = last;
    size_t keep;

    /* Skip trimmed characters from the left... */
    for (; left <= last; left++) {
        if (strchr(cset, *left) == NULL) break;
    }
    /* ...and from the right, never crossing the left cursor. */
    for (; right > left; right--) {
        if (strchr(cset, *right) == NULL) break;
    }

    if (left > right) {
        keep = 0;
    } else {
        keep = (right-left)+1;
    }
    if (left != s) memmove(s, left, keep);
    s[keep] = '\0';
    sdssetlen(s,keep);
    return s;
}
void sdsrange(sds s, ssize_t start, ssize_t end);
/* Shrink 's' in place to the substring delimited by the 'start' and 'end'
 * indexes, both inclusive and counted from 0.
 *
 * Negative indexes count from the end of the string: -1 is the last
 * character, -2 the penultimate one, and so forth. Indexes that fall
 * outside the string are clamped; an empty range yields an empty string.
 *
 * Example:
 *
 * s = sdsnew("Hello World");
 * sdsrange(s,1,-1); => "ello World"
 */
void sdsrange(sds s, ssize_t start, ssize_t end) {
    size_t len = sdslen(s);
    size_t newlen;

    if (len == 0) return;

    /* Translate negative indexes into offsets from the beginning. */
    if (start < 0) {
        start = len+start;
        if (start < 0) start = 0;
    }
    if (end < 0) {
        end = len+end;
        if (end < 0) end = 0;
    }

    if (start > end) {
        /* Empty range. */
        newlen = 0;
        start = 0;
    } else if (start >= (ssize_t)len) {
        /* Range begins past the end of the string. */
        newlen = 0;
    } else {
        if (end >= (ssize_t)len) end = len-1;
        newlen = (end-start)+1;
    }

    if (start && newlen) memmove(s, s+start, newlen);
    s[newlen] = 0;
    sdssetlen(s,newlen);
}
sds *sdssplitlen(const char *s, ssize_t len, const char *sep, int seplen, int *count);
/* Split 's' with separator in 'sep'. An array
 * of sds strings is returned. *count will be set
 * by reference to the number of tokens returned.
 *
 * Parameters:
 *   s      - input bytes.
 *   len    - number of bytes in 's'.
 *   sep    - separator bytes.
 *   seplen - number of bytes in 'sep'.
 *   count  - output: number of tokens in the returned array.
 *
 * On out of memory, negative length or zero length
 * separator, NULL is returned and *count is set to 0.
 * A zero length string yields an allocated array holding
 * no tokens, with *count set to 0.
 *
 * Note that 'sep' is able to split a string using
 * a multi-character separator. For example
 * sdssplit("foo_-_bar","_-_"); will return two
 * elements "foo" and "bar".
 *
 * This version of the function is binary-safe but
 * requires length arguments. sdssplit() is just the
 * same function but for zero-terminated strings.
 */
sds *sdssplitlen(const char *s, ssize_t len, const char *sep, int seplen, int *count) {
    int elements = 0, slots = 5;
    long start = 0, j;
    sds *tokens;

    /* Every NULL return reports zero tokens, so callers never read an
     * unset count. */
    if (seplen < 1 || len < 0) {
        *count = 0;
        return NULL;
    }

    tokens = s_malloc(sizeof(sds)*slots);
    if (tokens == NULL) {
        *count = 0;
        return NULL;
    }

    if (len == 0) {
        *count = 0;
        return tokens;
    }
    for (j = 0; j < (len-(seplen-1)); j++) {
        /* make sure there is room for the next element and the final one */
        if (slots < elements+2) {
            sds *newtokens;

            slots *= 2;
            newtokens = s_realloc(tokens,sizeof(sds)*slots);
            if (newtokens == NULL) goto cleanup;
            tokens = newtokens;
        }
        /* search the separator */
        if ((seplen == 1 && *(s+j) == sep[0]) || (memcmp(s+j,sep,seplen) == 0)) {
            tokens[elements] = sdsnewlen(s+start,j-start);
            if (tokens[elements] == NULL) goto cleanup;
            elements++;
            start = j+seplen;
            j = j+seplen-1; /* skip the separator */
        }
    }
    /* Add the final element. We are sure there is room in the tokens array. */
    tokens[elements] = sdsnewlen(s+start,len-start);
    if (tokens[elements] == NULL) goto cleanup;
    elements++;
    *count = elements;
    return tokens;

cleanup:
    {
        int i;
        for (i = 0; i < elements; i++) sdsfree(tokens[i]);
        s_free(tokens);
        *count = 0;
        return NULL;
    }
}
上面分析的关键函数是redis实现SDS的核心函数,像外部接口sdsnew底层调用的是sdsnewlen,sdscat底层调用的是sdscatlen(sdscpy底层调用的则是sdscpylen)。
其实c语言的字符串已经能够满足基本全部需求,为什么redis还要自己实现字符串sds呢?
要回答这个问题,还是回到开头说的c语言对于字符串的一般定义。其通常如下:
- char *buf1 = "redis_5.0";
- char buf2[] = "redis_5.0";
这两种都是用字符串常量来初始化:第一种是指向字符串常量的指针,内容不可以再修改;第二种是字符数组,内容可以修改,但是大小固定。再想想平时对字符串的操作函数,strcpy、strcat等函数不检查目标缓冲区的大小,一般是不安全的。
那么我们对比redis的实现,可以看出redis具有以下优点:
- 兼容c语言字符串
- 对于普通字符串的操作是安全的
- 可以动态扩展空间(最大是512M)
- 对字符串求长度的复杂度为O(1)
- 底层用的是数组,操作很快
- 从sdsMakeRoomFor的实现,我们知道redis采用了预分配冗余空间的方式来减小内存的频繁分配