diff --git a/src/deps/src/zstd-nginx-module/.gitattributes b/src/deps/src/zstd-nginx-module/.gitattributes new file mode 100644 index 000000000..6fe6f35ce --- /dev/null +++ b/src/deps/src/zstd-nginx-module/.gitattributes @@ -0,0 +1 @@ +*.t linguist-language=Text diff --git a/src/deps/src/zstd-nginx-module/.gitignore b/src/deps/src/zstd-nginx-module/.gitignore new file mode 100644 index 000000000..e3bcd3cf6 --- /dev/null +++ b/src/deps/src/zstd-nginx-module/.gitignore @@ -0,0 +1,54 @@ +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf + +t/servroot/* diff --git a/src/deps/src/zstd-nginx-module/LICENSE b/src/deps/src/zstd-nginx-module/LICENSE new file mode 100644 index 000000000..b4d1280c9 --- /dev/null +++ b/src/deps/src/zstd-nginx-module/LICENSE @@ -0,0 +1,25 @@ +BSD 2-Clause License + +Copyright (c) 2018, Alex Zhang +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/deps/src/zstd-nginx-module/README.md b/src/deps/src/zstd-nginx-module/README.md new file mode 100644 index 000000000..a1a7f4e90 --- /dev/null +++ b/src/deps/src/zstd-nginx-module/README.md @@ -0,0 +1,155 @@ +# Name +zstd-nginx-module - Nginx module for the [Zstandard compression](https://facebook.github.io/zstd/). + +# Table of Content + +* [Name](#name) +* [Status](#status) +* [Synopsis](#synopsis) +* [Installation](#installation) +* [Directives](#directives) + * [ngx_http_zstd_filter_module](#ngx_http_zstd_filter_module) + * [zstd_dict_file](#zstd_dict_file) + * [zstd](#zstd) + * [zstd_comp_level](#zstd_comp_level) + * [zstd_min_length](#zstd_min_length) + * [zstd_types](#zstd_types) + * [zstd_buffers](#zstd_buffers) + * [ngx_http_zstd_static_module](#ngx_http_zstd_static_module) + * [zstd_static](#zstd_static) +* [Variables](#variables) + * [ngx_http_zstd_filter_module](#ngx_http_zstd_filter_module) + * [$zstd_ratio](#$zstd_ratio) +* [Author](#author) + +# Status + +This Nginx module is currently considered experimental. Issues and PRs are welcome if you encounter any problems. 
+ +# Synopsis + +```nginx + +# specify the dictionary +zstd_dict_file /path/to/dict; + +server { + listen 127.0.0.1:8080; + server_name localhost; + + location / { + # enable zstd compression + zstd on; + zstd_min_length 256; # no less than 256 bytes + zstd_comp_level 3; # set the level to 3 + + proxy_pass http://foo.com; + } +} + +server { + listen 127.0.0.1:8081; + server_name localhost; + + location / { + zstd_static on; + root html; + } +} +``` + +# Installation + +To use these modules, configure your nginx branch with `--add-module=/path/to/zstd-nginx-module`. Several points should be taken care of. + +* You can set environment variables `ZSTD_INC` and `ZSTD_LIB` to specify the directory containing `zstd.h` and the directory containing the zstd library, respectively. +* The static library will be tried prior to the dynamic library, since this Nginx module uses some **advanced APIs** where static linking is recommended. +* System's zstd bundle will be linked if `ZSTD_INC` and `ZSTD_LIB` are not specified. +* Both `ngx_http_zstd_static_module` and `ngx_http_zstd_filter_module` will be configured. + +# Directives + +## ngx_http_zstd_filter_module + +The `ngx_http_zstd_filter_module` module is a filter that compresses responses using the "zstd" method. This often helps to reduce the size of transmitted data by half or even more. + +### zstd_dict_file + +**Syntax:** *zstd_dict_file /path/to/dict;* +**Default:** *-* +**Context:** *http* + +Specifies the external dictionary. + +**WARNING:** Be careful! The content-coding registration only specifies a means to signal the use of the zstd format, and does not additionally specify any mechanism for advertising/negotiating/synchronizing the use of a specific dictionary between client and server. Use the `zstd_dict_file` only if you can ensure that both ends (server and client) are capable of using the same dictionary (e.g. advertise with an HTTP header). See https://github.com/tokers/zstd-nginx-module/issues/2 for the details. 
+ +### zstd + +**Syntax:** *zstd on | off;* +**Default:** *zstd off;* +**Context:** *http, server, location, if in location* + +Enables or disables zstd compression for responses. + +### zstd_comp_level + +**Syntax:** *zstd_comp_level level;* +**Default:** *zstd_comp_level 1;* +**Context:** *http, server, location* + +Sets a zstd compression level of a response. Acceptable values are in the range from `ZSTD_minCLevel()` to `ZSTD_maxCLevel()`, excluding 0. + +### zstd_min_length + +**Syntax:** *zstd_min_length length;* +**Default:** *zstd_min_length 20;* +**Context:** *http, server, location* + +Sets the minimum length of a response that will be compressed by zstd. The length is determined only from the "Content-Length" response header field. + +### zstd_types + +**Syntax:** *zstd_types mime-type ...;* +**Default:** *zstd_types text/html;* +**Context:** *http, server, location* + +Enables zstd compression of responses for the specified MIME types in addition to "text/html". The special value "*" matches any MIME type. + +### zstd_buffers + +**Syntax:** *zstd_buffers number size;* +**Default:** *zstd_buffers 32 4k | 16 8k;* +**Context:** *http, server, location* + +Sets the number and size of buffers used to compress a response. By default, the buffer size is equal to one memory page. This is either 4K or 8K, depending on a platform. + +## ngx_http_zstd_static_module + +The `ngx_http_zstd_static_module` module allows sending precompressed files with the ".zst" filename extension instead of regular files. + +### zstd_static + +**Syntax:** *zstd_static on | off | always;* +**Default:** *zstd_static off;* +**Context:** *http, server, location* + +Enables ("on") or disables ("off") checking the existence of precompressed files. The following directives are also taken into account: gzip_vary. + +With the "always" value, the ".zst" file is used in all cases, without checking if the client supports it. 
+ + +# Variables + +## ngx_http_zstd_filter_module + +### $zstd_ratio + +Achieved compression ratio, computed as the ratio between the original and compressed response sizes. + +# Author + +Alex Zhang (张超) zchao1995@gmail, UPYUN Inc. + +# License + +This Nginx module is licensed under [BSD 2-Clause License](LICENSE). diff --git a/src/deps/src/zstd-nginx-module/config b/src/deps/src/zstd-nginx-module/config new file mode 100644 index 000000000..3d138836f --- /dev/null +++ b/src/deps/src/zstd-nginx-module/config @@ -0,0 +1,11 @@ +# Make sure the module knows it is a submodule. +ngx_addon_name=ngx_zstd +. $ngx_addon_dir/filter/config + +# Make sure the module knows it is a submodule. +ngx_addon_name=ngx_zstd +. $ngx_addon_dir/static/config + +# The final name for reporting. +ngx_addon_name=ngx_zstd + diff --git a/src/deps/src/zstd-nginx-module/filter/config b/src/deps/src/zstd-nginx-module/filter/config new file mode 100644 index 000000000..2942125c6 --- /dev/null +++ b/src/deps/src/zstd-nginx-module/filter/config @@ -0,0 +1,144 @@ +ngx_feature_incs="#include " +ngx_feature_test="(void) ZSTD_createCCtx();" +ngx_feature_libs= +ngx_feature_run=yes + +ngx_zstd_opt_I= +ngx_zstd_opt_L= + +if [ -n "$ZSTD_INC" -o -n "$ZSTD_LIB" ]; then + ngx_feature="ZStandard static library in $ZSTD_INC and $ZSTD_LIB" + ngx_feature_path=$ZSTD_INC + + # we try the static shared library firstly + ngx_zstd_opt_I="-I$ZSTD_INC -DZSTD_STATIC_LINKING_ONLY" + ngx_zstd_opt_L="$ZSTD_LIB/libzstd.a" + SAVED_CC_TAST_FLAGS=$CC_TEST_FLAGS + CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS" + SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT + NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT" + + . 
auto/feature + + # restore + CC_TEST_FLAGS=$SAVED_CC_TAST_FLAGS + NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT + + if [ $ngx_found = no ]; then + # then try the dynamic shared library + ngx_feature="ZStandard dynamic library in $ZSTD_INC and $ZSTD_LIB" + ngx_zstd_opt_L="-L$ZSTD_LIB -lzstd -Wl,-rpath,$ZSTD_LIB" + + SAVED_CC_TAST_FLAGS=$CC_TEST_FLAGS + CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS" + SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT + NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT" + + . auto/feature + + # restore + CC_TEST_FLAGS=$SAVED_CC_TAST_FLAGS + NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT + + if [ $ngx_found = no ]; then + cat << END + $0: error: ngx_http_zstd_filter_module requires the ZStandard library, please be sure that "\$ZSTD_INC" and "\$ZSTD_LIB" are set correctly. +END + exit 1 + fi + + fi +else + # auto-discovery + ngx_feature="ZStandard static library" + ngx_zstd_opt_I="-DZSTD_STATIC_LINKING_ONLY" + ngx_zstd_opt_L="-l:libzstd.a" + + # still we consider the static library firstly + SAVED_CC_TAST_FLAGS=$CC_TEST_FLAGS + CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS" + SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT + NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT" + + . auto/feature + + # restore + CC_TEST_FLAGS=$SAVED_CC_TAST_FLAGS + NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT + + if [ $ngx_found = no ]; then + + ngx_feature="ZStandard dynamic library" + ngx_zstd_opt_L="-lzstd" + SAVED_CC_TAST_FLAGS=$CC_TEST_FLAGS + CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS" + SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT + NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT" + + . auto/feature + + if [ $ngx_found = no ]; then + cat << END + $0: error: ngx_http_zstd_filter_module requires the ZStandard library. 
+END + exit 1 + fi + + # restore + CC_TEST_FLAGS=$SAVED_CC_TAST_FLAGS + NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT + + cat << END + $0: warning: ngx_http_zstd_filter_module uses advanced ZStandard APIs (which are still considered experimental) while you are trying to link the dynamic shared library. +END + fi + + # TODO we need more tries for the different OS port. +fi + +NGX_LD_OPT="$ngx_zstd_opt_L $NGX_LD_OPT" + +HTTP_ZSTD_SRCS="$ngx_addon_dir/filter/ngx_http_zstd_filter_module.c" + +ngx_addon_name=ngx_http_zstd_filter_module +ngx_module_type=HTTP_FILTER +ngx_module_name=ngx_http_zstd_filter_module +ngx_module_incs="$ngx_zstd_opt_I" +ngx_module_srcs=$HTTP_ZSTD_SRCS +ngx_module_libs=$NGX_LD_OPT +ngx_module_order="$ngx_module_name \ + ngx_pagespeed \ + ngx_http_postpone_filter_module \ + ngx_http_ssi_filter_module \ + ngx_http_charset_filter_module \ + ngx_http_xslt_filter_module \ + ngx_http_image_filter_module \ + ngx_http_sub_filter_module \ + ngx_http_addition_filter_module \ + ngx_http_gunzip_filter_module \ + ngx_http_userid_filter_module \ + ngx_http_headers_filter_module \ + ngx_http_copy_filter_module \ + ngx_http_range_body_filter_module \ + ngx_http_not_modified_filter_module \ + ngx_http_slice_filter_module" + +. auto/module + +if [ "$ngx_module_link" != DYNAMIC ]; then + # ngx_module_order doesn't work with static modules, + # so we must re-order filters here. 
+ + if [ "$HTTP_GZIP" = YES ]; then + next=ngx_http_gzip_filter_module + elif echo $HTTP_FILTER_MODULES | grep pagespeed_etag_filter >/dev/null; then + next=ngx_pagespeed_etag_filter + else + next=ngx_http_range_header_filter_module + fi + + HTTP_FILTER_MODULES=`echo $HTTP_FILTER_MODULES \ + | sed "s/$ngx_module_name//" \ + | sed "s/$next/$next $ngx_module_name/"` +fi + diff --git a/src/deps/src/zstd-nginx-module/filter/ngx_http_zstd_filter_module.c b/src/deps/src/zstd-nginx-module/filter/ngx_http_zstd_filter_module.c new file mode 100644 index 000000000..50ec55f12 --- /dev/null +++ b/src/deps/src/zstd-nginx-module/filter/ngx_http_zstd_filter_module.c @@ -0,0 +1,1035 @@ + +/* + * Copyright (C) Alex Zhang + */ + + +#include +#include +#include + +#include + + +#define NGX_HTTP_ZSTD_FILTER_COMPRESS 0 +#define NGX_HTTP_ZSTD_FILTER_FLUSH 1 +#define NGX_HTTP_ZSTD_FILTER_END 2 + + +typedef struct { + ngx_str_t dict_file; +} ngx_http_zstd_main_conf_t; + + +typedef struct { + ngx_flag_t enable; + ngx_int_t level; + ssize_t min_length; + + ngx_hash_t types; + + ngx_bufs_t bufs; + + ngx_array_t *types_keys; + + ZSTD_CDict *dict; +} ngx_http_zstd_loc_conf_t; + + +typedef struct { + ngx_chain_t *in; + ngx_chain_t *free; + ngx_chain_t *busy; + ngx_chain_t *out; + ngx_chain_t **last_out; + + ngx_buf_t *in_buf; + ngx_buf_t *out_buf; + ngx_int_t bufs; + + ZSTD_inBuffer buffer_in; + ZSTD_outBuffer buffer_out; + + ZSTD_CStream *cstream; + + ngx_http_request_t *request; + + size_t bytes_in; + size_t bytes_out; + + unsigned action:2; + unsigned last:1; + unsigned redo:1; + unsigned flush:1; + unsigned done:1; + unsigned nomem:1; +} ngx_http_zstd_ctx_t; + + +typedef struct { + ngx_conf_post_handler_pt post_handler; +} ngx_http_zstd_comp_level_bounds_t; + + +static ngx_http_output_header_filter_pt ngx_http_next_header_filter; +static ngx_http_output_body_filter_pt ngx_http_next_body_filter; + +static ngx_str_t ngx_http_zstd_ratio = ngx_string("zstd_ratio"); + + +static ngx_int_t 
ngx_http_zstd_header_filter(ngx_http_request_t *r); +static ngx_int_t ngx_http_zstd_body_filter(ngx_http_request_t *r, + ngx_chain_t *in); +static ngx_int_t ngx_http_zstd_filter_add_data(ngx_http_request_t *r, + ngx_http_zstd_ctx_t *ctx); +static ngx_int_t ngx_http_zstd_filter_get_buf(ngx_http_request_t *r, + ngx_http_zstd_ctx_t *ctx); +static ZSTD_CStream *ngx_http_zstd_filter_create_cstream(ngx_http_request_t *r, + ngx_http_zstd_ctx_t *ctx); +static ngx_int_t ngx_http_zstd_filter_compress(ngx_http_request_t *r, + ngx_http_zstd_ctx_t *ctx); +static ngx_int_t ngx_http_zstd_accept_encoding(ngx_str_t *ae); +static ngx_int_t ngx_http_zstd_ok(ngx_http_request_t *r); +static ngx_int_t ngx_http_zstd_filter_init(ngx_conf_t *cf); +static void * ngx_http_zstd_create_main_conf(ngx_conf_t *cf); +static char *ngx_http_zstd_init_main_conf(ngx_conf_t *cf, void *conf); +static void *ngx_http_zstd_create_loc_conf(ngx_conf_t *cf); +static char *ngx_http_zstd_merge_loc_conf(ngx_conf_t *cf, void *parent, + void *child); +static ngx_int_t ngx_http_zstd_add_variables(ngx_conf_t *cf); +static ngx_int_t ngx_http_zstd_ratio_variable(ngx_http_request_t *r, + ngx_http_variable_value_t *vv, uintptr_t data); +static void * ngx_http_zstd_filter_alloc(void *opaque, size_t size); +static void ngx_http_zstd_filter_free(void *opaque, void *address); +static char *ngx_http_zstd_comp_level(ngx_conf_t *cf, void *post, void *data); +static char *ngx_conf_zstd_set_num_slot_with_negatives(ngx_conf_t *cf, ngx_command_t *cmd, void *conf); + + +static ngx_http_zstd_comp_level_bounds_t ngx_http_zstd_comp_level_bounds = { + ngx_http_zstd_comp_level +}; + + +static ngx_command_t ngx_http_zstd_filter_commands[] = { + + { ngx_string("zstd"), + NGX_HTTP_MAIN_CONF|NGX_HTTP_SRV_CONF|NGX_HTTP_LOC_CONF|NGX_HTTP_LIF_CONF + |NGX_CONF_FLAG, + ngx_conf_set_flag_slot, + NGX_HTTP_LOC_CONF_OFFSET, + offsetof(ngx_http_zstd_loc_conf_t, enable), + NULL }, + + { ngx_string("zstd_comp_level"), + 
NGX_HTTP_MAIN_CONF|NGX_HTTP_SRV_CONF|NGX_HTTP_LOC_CONF|NGX_CONF_TAKE1, + ngx_conf_zstd_set_num_slot_with_negatives, + NGX_HTTP_LOC_CONF_OFFSET, + offsetof(ngx_http_zstd_loc_conf_t, level), + &ngx_http_zstd_comp_level_bounds }, + + { ngx_string("zstd_types"), + NGX_HTTP_MAIN_CONF|NGX_HTTP_SRV_CONF|NGX_HTTP_LOC_CONF|NGX_CONF_1MORE, + ngx_http_types_slot, + NGX_HTTP_LOC_CONF_OFFSET, + offsetof(ngx_http_zstd_loc_conf_t, types_keys), + &ngx_http_html_default_types[0] }, + + { ngx_string("zstd_buffers"), + NGX_HTTP_MAIN_CONF|NGX_HTTP_SRV_CONF|NGX_HTTP_LOC_CONF|NGX_CONF_TAKE2, + ngx_conf_set_bufs_slot, + NGX_HTTP_LOC_CONF_OFFSET, + offsetof(ngx_http_zstd_loc_conf_t, bufs), + NULL }, + + { ngx_string("zstd_min_length"), + NGX_HTTP_MAIN_CONF|NGX_HTTP_SRV_CONF|NGX_HTTP_LOC_CONF|NGX_CONF_1MORE, + ngx_conf_set_size_slot, + NGX_HTTP_LOC_CONF_OFFSET, + offsetof(ngx_http_zstd_loc_conf_t, min_length), + NULL }, + + { ngx_string("zstd_dict_file"), + NGX_HTTP_MAIN_CONF|NGX_CONF_TAKE1, + ngx_conf_set_str_slot, + NGX_HTTP_MAIN_CONF_OFFSET, + offsetof(ngx_http_zstd_main_conf_t, dict_file), + NULL }, + + ngx_null_command +}; + + +static ngx_http_module_t ngx_http_zstd_filter_module_ctx = { + ngx_http_zstd_add_variables, /* preconfiguration */ + ngx_http_zstd_filter_init, /* postconfiguration */ + + ngx_http_zstd_create_main_conf, /* create main configuration */ + ngx_http_zstd_init_main_conf, /* init main configuration */ + + NULL, /* create server configuration */ + NULL, /* merge server configuration */ + + ngx_http_zstd_create_loc_conf, /* create location configuration */ + ngx_http_zstd_merge_loc_conf, /* merge location configuration */ +}; + + +ngx_module_t ngx_http_zstd_filter_module = { + NGX_MODULE_V1, + &ngx_http_zstd_filter_module_ctx, /* module context */ + ngx_http_zstd_filter_commands, /* module directives */ + NGX_HTTP_MODULE, /* module type */ + NULL, /* init master */ + NULL, /* init module */ + NULL, /* init process */ + NULL, /* init thread */ + NULL, /* exit thread */ 
+ NULL, /* exit process */ + NULL, /* exit master */ + NGX_MODULE_V1_PADDING +}; + + +static ngx_int_t +ngx_http_zstd_header_filter(ngx_http_request_t *r) +{ + ngx_table_elt_t *h; + ngx_http_zstd_loc_conf_t *zlcf; + ngx_http_zstd_ctx_t *ctx; + + zlcf = ngx_http_get_module_loc_conf(r, ngx_http_zstd_filter_module); + + if (!zlcf->enable + || (r->headers_out.status != NGX_HTTP_OK + && r->headers_out.status != NGX_HTTP_FORBIDDEN + && r->headers_out.status != NGX_HTTP_NOT_FOUND) + || (r->headers_out.content_encoding + && r->headers_out.content_encoding->value.len) + || (r->headers_out.content_length_n != -1 + && r->headers_out.content_length_n < zlcf->min_length) + || ngx_http_test_content_type(r, &zlcf->types) == NULL + || r->header_only) + { + return ngx_http_next_header_filter(r); + } + + r->gzip_vary = 1; + + if (ngx_http_zstd_ok(r) != NGX_OK) { + return ngx_http_next_header_filter(r); + } + + ctx = ngx_pcalloc(r->pool, sizeof(ngx_http_zstd_ctx_t)); + if (ctx == NULL) { + return NGX_ERROR; + } + + ngx_http_set_ctx(r, ctx, ngx_http_zstd_filter_module); + + ctx->request = r; + ctx->last_out = &ctx->out; + + h = ngx_list_push(&r->headers_out.headers); + if (h == NULL) { + return NGX_ERROR; + } + + h->hash = 1; + ngx_str_set(&h->key, "Content-Encoding"); + ngx_str_set(&h->value, "zstd"); + r->headers_out.content_encoding = h; + + r->main_filter_need_in_memory = 1; + + ngx_http_clear_content_length(r); + ngx_http_clear_accept_ranges(r); + ngx_http_weak_etag(r); + + return ngx_http_next_header_filter(r); +} + + +static ngx_int_t +ngx_http_zstd_body_filter(ngx_http_request_t *r, ngx_chain_t *in) +{ + size_t rv; + ngx_int_t flush, rc; + ngx_chain_t *cl; + ngx_http_zstd_ctx_t *ctx; + + + ctx = ngx_http_get_module_ctx(r, ngx_http_zstd_filter_module); + + if (ctx == NULL || ctx->done || r->header_only) { + return ngx_http_next_body_filter(r, in); + } + + ngx_log_debug0(NGX_LOG_DEBUG_HTTP, r->connection->log, 0, + "http zstd filter"); + + if (ctx->cstream == NULL) { + 
ctx->cstream = ngx_http_zstd_filter_create_cstream(r, ctx); + if (ctx->cstream == NULL) { + goto failed; + } + } + + if (in) { + if (ngx_chain_add_copy(r->pool, &ctx->in, in) != NGX_OK) { + goto failed; + } + + r->connection->buffered |= NGX_HTTP_GZIP_BUFFERED; + } + + if (ctx->nomem) { + + /* flush busy buffers */ + + if (ngx_http_next_body_filter(r, NULL) == NGX_ERROR) { + goto failed; + } + + cl = NULL; + + ngx_chain_update_chains(r->pool, &ctx->free, &ctx->busy, &cl, + (ngx_buf_tag_t) &ngx_http_zstd_filter_module); + + flush = 0; + ctx->nomem = 0; + + } else { + flush = ctx->busy ? 1 : 0; + } + + for ( ;; ) { + + /* cycle while we can write to a client */ + + for ( ;; ) { + + rc = ngx_http_zstd_filter_add_data(r, ctx); + + if (rc == NGX_DECLINED) { + break; + } + + if (rc == NGX_AGAIN) { + continue; + } + + rc = ngx_http_zstd_filter_get_buf(r, ctx); + + if (rc == NGX_ERROR) { + goto failed; + } + + if (rc == NGX_DECLINED) { + break; + } + + rc = ngx_http_zstd_filter_compress(r, ctx); + + if (rc == NGX_ERROR) { + goto failed; + } + + if (rc == NGX_OK) { + break; + } + + /* rc == NGX_AGAIN */ + } + + if (ctx->out == NULL && !flush) { + return ctx->busy ? 
NGX_AGAIN : NGX_OK; + } + + rc = ngx_http_next_body_filter(r, ctx->out); + + if (rc == NGX_ERROR) { + goto failed; + } + + ngx_chain_update_chains(r->pool, &ctx->free, &ctx->busy, &ctx->out, + (ngx_buf_tag_t) &ngx_http_zstd_filter_module); + + ctx->last_out = &ctx->out; + ctx->nomem = 0; + flush = 0; + + if (ctx->done) { + rv = ZSTD_freeCStream(ctx->cstream); + if (ZSTD_isError(rv)) { + ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0, + "ZSTD_freeCStream() failed: %s", + ZSTD_getErrorName(rv)); /* rv is the zstd status; rc is the next filter's result */ + + rc = NGX_ERROR; + } + + return rc; + } + } + +failed: + + ctx->done = 1; + rv = ZSTD_freeCStream(ctx->cstream); + if (ZSTD_isError(rv)) { + ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0, + "ZSTD_freeCStream() failed: %s", ZSTD_getErrorName(rv)); + } + + return NGX_ERROR; +} + + +static ngx_int_t +ngx_http_zstd_filter_compress(ngx_http_request_t *r, ngx_http_zstd_ctx_t *ctx) +{ + size_t rc, pos_in, pos_out; + char *hint; + ngx_chain_t *cl; + ngx_buf_t *b; + + ngx_log_debug8(NGX_LOG_DEBUG_HTTP, r->connection->log, 0, + "zstd compress in: src:%p pos:%ud size: %ud, " + "dst:%p pos:%ud size:%ud flush:%d redo:%d", + ctx->buffer_in.src, ctx->buffer_in.pos, ctx->buffer_in.size, + ctx->buffer_out.dst, ctx->buffer_out.pos, + ctx->buffer_out.size, ctx->flush, ctx->redo); + + pos_in = ctx->buffer_in.pos; + pos_out = ctx->buffer_out.pos; + + switch (ctx->action) { + + case NGX_HTTP_ZSTD_FILTER_FLUSH: + hint = "ZSTD_flushStream() "; + rc = ZSTD_flushStream(ctx->cstream, &ctx->buffer_out); + break; + + case NGX_HTTP_ZSTD_FILTER_END: + hint = "ZSTD_endStream() "; + rc = ZSTD_endStream(ctx->cstream, &ctx->buffer_out); + break; + + default: + hint = "ZSTD_compressStream() "; + rc = ZSTD_compressStream(ctx->cstream, &ctx->buffer_out, + &ctx->buffer_in); + break; + } + + if (ZSTD_isError(rc)) { + ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0, + "%s failed: %s", hint, ZSTD_getErrorName(rc)); + + return NGX_ERROR; + } + + ngx_log_debug6(NGX_LOG_DEBUG_HTTP, r->connection->log, 0, 
+ "zstd compress out: src:%p pos:%ud size: %ud, " + "dst:%p pos:%ud size:%ud", + ctx->buffer_in.src, ctx->buffer_in.pos, ctx->buffer_in.size, + ctx->buffer_out.dst, ctx->buffer_out.pos, + ctx->buffer_out.size); + + ctx->in_buf->pos += ctx->buffer_in.pos - pos_in; + ctx->out_buf->last += ctx->buffer_out.pos - pos_out; + ctx->redo = 0; + + if (rc > 0) { + if (ctx->action == NGX_HTTP_ZSTD_FILTER_COMPRESS) { + ctx->action = NGX_HTTP_ZSTD_FILTER_FLUSH; + } + + ctx->redo = 1; + + } else if (ctx->last && ctx->action != NGX_HTTP_ZSTD_FILTER_END) { + ctx->redo = 1; + ctx->action = NGX_HTTP_ZSTD_FILTER_END; + + /* pending to call the ZSTD_endStream() */ + + return NGX_AGAIN; + + } else { + ctx->action = NGX_HTTP_ZSTD_FILTER_COMPRESS; /* restore */ + } + + if (ngx_buf_size(ctx->out_buf) == 0) { + return NGX_AGAIN; + } + + cl = ngx_alloc_chain_link(r->pool); + if (cl == NULL) { + return NGX_ERROR; + } + + b = ctx->out_buf; + + if (rc == 0 && (ctx->flush || ctx->last)) { + r->connection->buffered &= ~NGX_HTTP_GZIP_BUFFERED; + + b->flush = ctx->flush; + b->last_buf = ctx->last; + + ctx->done = ctx->last; + ctx->flush = 0; + } + + ctx->bytes_out += ngx_buf_size(b); + + cl->next = NULL; + cl->buf = b; + + *ctx->last_out = cl; + ctx->last_out = &cl->next; + + ngx_memzero(&ctx->buffer_out, sizeof(ZSTD_outBuffer)); + + return ctx->last && rc == 0 ? 
NGX_OK : NGX_AGAIN; +} + + +static ngx_int_t +ngx_http_zstd_filter_add_data(ngx_http_request_t *r, ngx_http_zstd_ctx_t *ctx) +{ + if (ctx->buffer_in.pos < ctx->buffer_in.size + || ctx->flush + || ctx->last + || ctx->redo) + { + return NGX_OK; + } + + ngx_log_debug1(NGX_LOG_DEBUG_HTTP, r->connection->log, 0, + "zstd in: %p", ctx->in); + + if (ctx->in == NULL) { + return NGX_DECLINED; + } + + ctx->in_buf = ctx->in->buf; + ctx->in = ctx->in->next; + + if (ctx->in_buf->flush) { + ctx->flush = 1; + + } else if (ctx->in_buf->last_buf) { + ctx->last = 1; + } + + ctx->buffer_in.src = ctx->in_buf->pos; + ctx->buffer_in.pos = 0; + ctx->buffer_in.size = ngx_buf_size(ctx->in_buf); + + ctx->bytes_in += ngx_buf_size(ctx->in_buf); + + if (ctx->buffer_in.size == 0) { + return NGX_AGAIN; + } + + return NGX_OK; +} + + +static ngx_int_t +ngx_http_zstd_filter_get_buf(ngx_http_request_t *r, ngx_http_zstd_ctx_t *ctx) +{ + ngx_chain_t *cl; + ngx_http_zstd_loc_conf_t *zlcf; + + if (ctx->buffer_out.pos < ctx->buffer_out.size) { + return NGX_OK; + } + + zlcf = ngx_http_get_module_loc_conf(r, ngx_http_zstd_filter_module); + + if (ctx->free) { + cl = ctx->free; + ctx->free = ctx->free->next; + ctx->out_buf = cl->buf; + ngx_free_chain(r->pool, cl); + + } else if (ctx->bufs < zlcf->bufs.num) { + ctx->out_buf = ngx_create_temp_buf(r->pool, zlcf->bufs.size); + if (ctx->out_buf == NULL) { + return NGX_ERROR; + } + + ctx->out_buf->tag = (ngx_buf_tag_t) &ngx_http_zstd_filter_module; + ctx->out_buf->recycled = 1; + ctx->bufs++; + + } else { + ctx->nomem = 1; + return NGX_DECLINED; + } + + ctx->buffer_out.dst = ctx->out_buf->pos; + ctx->buffer_out.pos = 0; + ctx->buffer_out.size = ctx->out_buf->end - ctx->out_buf->start; + + return NGX_OK; +} + + +static ZSTD_CStream * +ngx_http_zstd_filter_create_cstream(ngx_http_request_t *r, + ngx_http_zstd_ctx_t *ctx) +{ + size_t rc; + ZSTD_CStream *cstream; + ZSTD_customMem cmem; + ngx_http_zstd_loc_conf_t *zlcf; + + zlcf = ngx_http_get_module_loc_conf(r, 
ngx_http_zstd_filter_module); + + cmem.customAlloc = ngx_http_zstd_filter_alloc; + cmem.customFree = ngx_http_zstd_filter_free; + cmem.opaque = ctx; + + cstream = ZSTD_createCStream_advanced(cmem); + if (cstream == NULL) { + ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0, + "ZSTD_createCStream_advanced() failed"); + + return NULL; + } + + /* TODO use the advanced initialize functions */ + + if (zlcf->dict) { +#if ZSTD_VERSION_NUMBER >= 10500 + rc = ZSTD_CCtx_reset(cstream, ZSTD_reset_session_only); + if (ZSTD_isError(rc)) { + ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0, + "ZSTD_CCtx_reset() failed: %s", + ZSTD_getErrorName(rc)); + goto failed; + } + + rc = ZSTD_CCtx_refCDict(cstream, zlcf->dict); + if (ZSTD_isError(rc)) { + ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0, + "ZSTD_CCtx_refCDict() failed: %s", + ZSTD_getErrorName(rc)); + goto failed; + } +#else + rc = ZSTD_initCStream_usingCDict(cstream, zlcf->dict); +#endif + if (ZSTD_isError(rc)) { + ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0, + "ZSTD_initCStream_usingCDict() failed: %s", + ZSTD_getErrorName(rc)); + + goto failed; + } + + } else { + rc = ZSTD_initCStream(cstream, zlcf->level); + if (ZSTD_isError(rc)) { + ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0, + "ZSTD_initCStream() failed: %s", + ZSTD_getErrorName(rc)); + + goto failed; + } + } + + return cstream; + +failed: + rc = ZSTD_freeCStream(cstream); + if (ZSTD_isError(rc)) { + ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0, + "ZSTD_freeCStream() failed: %s", ZSTD_getErrorName(rc)); + } + + return NULL; +} + + +static ngx_int_t +ngx_http_zstd_accept_encoding(ngx_str_t *ae) +{ + u_char *p; + + p = ngx_strcasestrn(ae->data, "zstd", sizeof("zstd") - 2); + if (p == NULL) { + return NGX_DECLINED; + } + + if (p == ae->data || (*(p - 1) == ',' || *(p - 1) == ' ')) { + + p += sizeof("zstd") - 1; + + if (p == ae->data + ae->len || *p == ',' || *p == ' ' || *p == ';') { + return NGX_OK; + } + } + + return NGX_DECLINED; +} + + 
+static ngx_int_t +ngx_http_zstd_ok(ngx_http_request_t *r) +{ + ngx_table_elt_t *ae; + + if (r != r->main) { + return NGX_DECLINED; + } + + ae = r->headers_in.accept_encoding; + if (ae == NULL) { + return NGX_DECLINED; + } + + if (ae->value.len < sizeof("zstd") - 1) { + return NGX_DECLINED; + } + + if (ngx_memcmp(ae->value.data, "zstd", 4) != 0 + && ngx_http_zstd_accept_encoding(&ae->value) != NGX_OK) + { + return NGX_DECLINED; + } + + + r->gzip_tested = 1; + r->gzip_ok = 0; + + return NGX_OK; +} + + +static void * +ngx_http_zstd_create_main_conf(ngx_conf_t *cf) +{ + ngx_http_zstd_main_conf_t *zmcf; + + zmcf = ngx_pcalloc(cf->pool, sizeof(ngx_http_zstd_main_conf_t)); + if (zmcf == NULL) { + return NULL; + } + + return zmcf; +} + + +static char * +ngx_http_zstd_init_main_conf(ngx_conf_t *cf, void *conf) +{ + ngx_http_zstd_main_conf_t *zmcf = conf; + + if (zmcf->dict_file.len == 0) { + return NGX_CONF_OK; + } + + if (ngx_conf_full_name(cf->cycle, &zmcf->dict_file, 1) != NGX_OK) { + return NGX_CONF_ERROR; + } + + return NGX_CONF_OK; +} + + +static void * +ngx_http_zstd_create_loc_conf(ngx_conf_t *cf) +{ + ngx_http_zstd_loc_conf_t *conf; + + conf = ngx_pcalloc(cf->pool, sizeof(ngx_http_zstd_loc_conf_t)); + if (conf == NULL) { + return NULL; + } + + /* + * set by ngx_pcalloc(): + * + * conf->bufs.num = 0; + * conf->types = { NULL }; + * conf->types_keys = NULL; + * conf->dict = NULL; + */ + + conf->enable = NGX_CONF_UNSET; + conf->level = NGX_CONF_UNSET; + conf->min_length = NGX_CONF_UNSET; + + return conf; +} + + +static char * +ngx_http_zstd_merge_loc_conf(ngx_conf_t *cf, void *parent, void *child) +{ + ngx_http_zstd_loc_conf_t *prev = parent; + ngx_http_zstd_loc_conf_t *conf = child; + + ngx_fd_t fd; + size_t size; + ssize_t n; + char *rc; + u_char *buf; + ngx_file_info_t info; + ngx_http_zstd_main_conf_t *zmcf; + + rc = NGX_OK; + buf = NULL; + fd = NGX_INVALID_FILE; + + ngx_conf_merge_value(conf->enable, prev->enable, 0); + ngx_conf_merge_value(conf->level, 
prev->level, 1); + ngx_conf_merge_value(conf->min_length, prev->min_length, 20); + + if (ngx_http_merge_types(cf, &conf->types_keys, &conf->types, + &prev->types_keys, &prev->types, + ngx_http_html_default_types)) + { + return NGX_CONF_ERROR; + } + + ngx_conf_merge_ptr_value(conf->dict, prev->dict, NULL); + ngx_conf_merge_bufs_value(conf->bufs, prev->bufs, + (128 * 1024) / ngx_pagesize, ngx_pagesize); + + zmcf = ngx_http_conf_get_module_main_conf(cf, ngx_http_zstd_filter_module); + + if (conf->enable && zmcf->dict_file.len > 0) { + + if (conf->level == prev->level) { + conf->dict = prev->dict; + + } else { + /* + * compression level is different from the outer block, + * so we should create a separate dict object. + */ + + fd = ngx_open_file(zmcf->dict_file.data, NGX_FILE_RDONLY, + NGX_FILE_OPEN, 0); + + if (fd == NGX_INVALID_FILE) { + ngx_conf_log_error(NGX_LOG_EMERG, cf, ngx_errno, + ngx_open_file_n " \"%V\" failed", + &zmcf->dict_file); + + return NGX_CONF_ERROR; + } + + if (ngx_fd_info(fd, &info) == NGX_FILE_ERROR) { + ngx_conf_log_error(NGX_LOG_EMERG, cf, ngx_errno, + ngx_fd_info_n " \"%V\" failed", + &zmcf->dict_file); + + rc = NGX_CONF_ERROR; + goto close; + } + + size = ngx_file_size(&info); + buf = ngx_palloc(cf->pool, size); + if (buf == NULL) { + rc = NGX_CONF_ERROR; + goto close; + } + + n = ngx_read_fd(fd, (void *) buf, size); + if (n < 0) { + ngx_conf_log_error(NGX_LOG_EMERG, cf, ngx_errno, + ngx_read_fd_n " \"%V\" failed", + &zmcf->dict_file); + + rc = NGX_CONF_ERROR; + goto close; + + } else if ((size_t) n != size) { + ngx_conf_log_error(NGX_LOG_EMERG, cf, 0, /* a short read does not set errno */ + ngx_read_fd_n " \"%V\" incomplete", + &zmcf->dict_file); + + rc = NGX_CONF_ERROR; + goto close; + } + + conf->dict = ZSTD_createCDict_byReference(buf, size, conf->level); + if (conf->dict == NULL) { + ngx_conf_log_error(NGX_LOG_EMERG, cf, 0, + "ZSTD_createCDict_byReference() failed"); + rc = NGX_CONF_ERROR; + goto close; + } + } + } + +close: + + if (fd != NGX_INVALID_FILE && 
ngx_close_file(fd) == NGX_FILE_ERROR) { + ngx_conf_log_error(NGX_LOG_EMERG, cf, ngx_errno, + ngx_close_file_n " \"%V\" failed", + &zmcf->dict_file); + + rc = NGX_CONF_ERROR; + } + + return rc; +} + + +static ngx_int_t +ngx_http_zstd_filter_init(ngx_conf_t *cf) +{ + ngx_http_next_header_filter = ngx_http_top_header_filter; + ngx_http_top_header_filter = ngx_http_zstd_header_filter; + + ngx_http_next_body_filter = ngx_http_top_body_filter; + ngx_http_top_body_filter = ngx_http_zstd_body_filter; + + return NGX_OK; +} + + +static void * +ngx_http_zstd_filter_alloc(void *opaque, size_t size) +{ + ngx_http_zstd_ctx_t *ctx = opaque; + + void *p; + + p = ngx_palloc(ctx->request->pool, size); + + ngx_log_debug2(NGX_LOG_DEBUG_HTTP, ctx->request->connection->log, 0, + "zstd alloc: %p, size: %uz", p, size); + + return p; +} + + +static ngx_int_t +ngx_http_zstd_add_variables(ngx_conf_t *cf) +{ + ngx_http_variable_t *v; + + v = ngx_http_add_variable(cf, &ngx_http_zstd_ratio, + NGX_HTTP_VAR_NOCACHEABLE); + if (v == NULL) { + return NGX_ERROR; + } + + v->get_handler = ngx_http_zstd_ratio_variable; + + return NGX_OK; +} + + +static ngx_int_t +ngx_http_zstd_ratio_variable(ngx_http_request_t *r, + ngx_http_variable_value_t *vv, uintptr_t data) +{ + ngx_uint_t ratio_int, ratio_frac; + ngx_http_zstd_ctx_t *ctx; + + ctx = ngx_http_get_module_ctx(r, ngx_http_zstd_filter_module); + if (ctx == NULL || !ctx->done || ctx->bytes_out == 0) { + vv->not_found = 1; + return NGX_OK; + } + + vv->data = ngx_pnalloc(r->pool, NGX_INT32_LEN + 3); + if (vv->data == NULL) { + return NGX_ERROR; + } + + ratio_int = (ngx_uint_t) ctx->bytes_in / ctx->bytes_out; + ratio_frac = (ngx_uint_t) (ctx->bytes_in * 1000 / ctx->bytes_out % 1000); + + vv->len = ngx_sprintf(vv->data, "%ui.%03ui", ratio_int, ratio_frac) + - vv->data; + + vv->valid = 1; + vv->no_cacheable = 1; + + return NGX_OK; +} + + +static void +ngx_http_zstd_filter_free(void *opaque, void *address) +{ +#if (NGX_DEBUG) + + ngx_http_zstd_ctx_t *ctx = 
opaque; + + ngx_log_debug1(NGX_LOG_DEBUG_HTTP, ctx->request->connection->log, 0, + "zstd free: %p", address); + +#endif +} + + +static char * +ngx_http_zstd_comp_level(ngx_conf_t *cf, void *post, void *data) +{ + ngx_int_t *np = data; + + if (*np == 0 || *np < (ngx_int_t)ZSTD_minCLevel() || *np > ZSTD_maxCLevel()) { + ngx_conf_log_error(NGX_LOG_EMERG, cf, 0, + "zstd compress level must between %i and %i excluding 0", + (ngx_int_t)ZSTD_minCLevel(), ZSTD_maxCLevel()); + + return NGX_CONF_ERROR; + } + + return NGX_CONF_OK; +} + +static char * +ngx_conf_zstd_set_num_slot_with_negatives(ngx_conf_t *cf, ngx_command_t *cmd, void *conf) +{ + char *p = conf; + + ngx_int_t *np; + ngx_str_t *value; + ngx_conf_post_t *post; + + + np = (ngx_int_t *) (p + cmd->offset); + + if (*np != NGX_CONF_UNSET) { + return "is duplicate"; + } + + value = cf->args->elts; + + if (*(value[1].data) == '-') { + // Parse ignoring the leading '-' character + *np = ngx_atoi(value[1].data + 1, value[1].len - 1); + + // NGX_ERROR is -1 so we need to check for that before making the parsed + // result negative + if (*np == NGX_ERROR) { + return "invalid number"; + } + + *np = -*np; + } else { + *np = ngx_atoi(value[1].data, value[1].len); + + if (*np == NGX_ERROR) { + return "invalid number"; + } + } + + if (cmd->post) { + post = cmd->post; + return post->post_handler(cf, post, np); + } + + return NGX_CONF_OK; +} diff --git a/src/deps/src/zstd-nginx-module/static/config b/src/deps/src/zstd-nginx-module/static/config new file mode 100644 index 000000000..ed6e66ee3 --- /dev/null +++ b/src/deps/src/zstd-nginx-module/static/config @@ -0,0 +1,111 @@ +ngx_feature_incs="#include " +ngx_feature_test="(void) ZSTD_createCCtx();" +ngx_feature_libs= +ngx_feature_run=yes + +ngx_zstd_opt_I= +ngx_zstd_opt_L= + +if [ -n "$ZSTD_INC" -o -n "$ZSTD_LIB" ]; then + ngx_feature="ZStandard static library in $ZSTD_INC and $ZSTD_LIB" + ngx_feature_path=$ZSTD_INC + + # we try the static shared library firstly + 
ngx_zstd_opt_I="-I$ZSTD_INC -DZSTD_STATIC_LINKING_ONLY" + ngx_zstd_opt_L="$ZSTD_LIB/libzstd.a" + SAVED_CC_TAST_FLAGS=$CC_TEST_FLAGS + CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS" + SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT + NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT" + + . auto/feature + + # restore + CC_TEST_FLAGS=$SAVED_CC_TAST_FLAGS + NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT + + if [ $ngx_found = no ]; then + # then try the dynamic shared library + ngx_feature="ZStandard dynamic library in $ZSTD_INC and $ZSTD_LIB" + ngx_zstd_opt_L="-L$ZSTD_LIB -lzstd -Wl,-rpath, $ZSTD_LIB" + + SAVED_CC_TAST_FLAGS=$CC_TEST_FLAGS + CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS" + SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT + NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT" + + . auto/feature + + # restore + CC_TEST_FLAGS=$SAVED_CC_TAST_FLAGS + NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT + + if [ $ngx_found = no ]; then + cat << END + $0: error: ngx_http_zstd_filter_module requires the ZStandard library, please be sure that "\$ZSTD_INC" and "\$ZSTD_LIB" are set correctly. +END + exit 1 + fi + + fi +else + # auto-discovery + ngx_feature="ZStandard static library" + ngx_zstd_opt_I="-DZSTD_STATIC_LINKING_ONLY" + ngx_zstd_opt_L="-l:libzstd.a" + + # still we consider the static library firstly + SAVED_CC_TAST_FLAGS=$CC_TEST_FLAGS + CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS" + SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT + NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT" + + . auto/feature + + # restore + CC_TEST_FLAGS=$SAVED_CC_TAST_FLAGS + NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT + + if [ $ngx_found = no ]; then + + ngx_feature="ZStandard dynamic library" + ngx_zstd_opt_L="-lzstd" + SAVED_CC_TAST_FLAGS=$CC_TEST_FLAGS + CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS" + SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT + NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT" + + . 
auto/feature + + if [ $ngx_found = no ]; then + cat << END + $0: error: ngx_http_zstd_filter_module requires the ZStandard library. +END + exit 1 + fi + + # restore + CC_TEST_FLAGS=$SAVED_CC_TAST_FLAGS + NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT + + cat << END + $0: warning: ngx_http_zstd_filter_module uses advanced ZStandard APIs (which are still considered experimental) while you are trying to link the dynamic shared library. +END + fi + + # TODO we need more tries for the different OS port. +fi + +CFLAGS="$ngx_zstd_opt_I $CFLAGS" +NGX_LD_OPT="$ngx_zstd_opt_L $NGX_LD_OPT" + +# build the ngx_http_zstd_static_module +HTTP_ZSTD_SRCS="$ngx_addon_dir/static/ngx_http_zstd_static_module.c" + +ngx_addon_name=ngx_http_zstd_static_module +ngx_module_type=HTTP +ngx_module_name=ngx_http_zstd_static_module +ngx_module_incs="$ngx_zstd_opt_I" +ngx_module_srcs=$HTTP_ZSTD_SRCS + +. auto/module diff --git a/src/deps/src/zstd-nginx-module/static/ngx_http_zstd_static_module.c b/src/deps/src/zstd-nginx-module/static/ngx_http_zstd_static_module.c new file mode 100644 index 000000000..3b247e95d --- /dev/null +++ b/src/deps/src/zstd-nginx-module/static/ngx_http_zstd_static_module.c @@ -0,0 +1,383 @@ + +/* + * Copyright (C) Alex Zhang + */ + + +#include +#include +#include + + +#define NGX_HTTP_ZSTD_STATIC_OFF 0 +#define NGX_HTTP_ZSTD_STATIC_ON 1 +#define NGX_HTTP_ZSTD_STATIC_ALWAYS 2 + + +typedef struct { + ngx_uint_t enable; +} ngx_http_zstd_static_conf_t; + + +static ngx_conf_enum_t ngx_http_zstd_static[] = { + { ngx_string("off"), NGX_HTTP_ZSTD_STATIC_OFF }, + { ngx_string("on"), NGX_HTTP_ZSTD_STATIC_ON }, + { ngx_string("always"), NGX_HTTP_ZSTD_STATIC_ALWAYS }, +}; + + +static ngx_command_t ngx_http_zstd_static_commands[] = { + + { ngx_string("zstd_static"), + NGX_HTTP_MAIN_CONF|NGX_HTTP_SRV_CONF|NGX_HTTP_LOC_CONF|NGX_CONF_TAKE1, + ngx_conf_set_enum_slot, + NGX_HTTP_LOC_CONF_OFFSET, + offsetof(ngx_http_zstd_static_conf_t, enable), + &ngx_http_zstd_static }, + + ngx_null_command +}; + + 
+static ngx_int_t ngx_http_zstd_static_handler(ngx_http_request_t *r); +static ngx_int_t ngx_http_zstd_accept_encoding(ngx_str_t *ae); +static ngx_int_t ngx_http_zstd_ok(ngx_http_request_t *r); +static void * ngx_http_zstd_static_create_loc_conf(ngx_conf_t *cf); +static char * ngx_http_zstd_static_merge_loc_conf(ngx_conf_t *cf, void *parent, + void *child); +static ngx_int_t ngx_http_zstd_static_init(ngx_conf_t *cf); + + +static ngx_http_module_t ngx_http_zstd_static_module_ctx = { + NULL, /* preconfiguration */ + ngx_http_zstd_static_init, /* postconfiguration */ + + NULL, /* create main configuration */ + NULL, /* init main configuration */ + + NULL, /* create server configuration */ + NULL, /* merge server configuration */ + + ngx_http_zstd_static_create_loc_conf, /* create location configuration */ + ngx_http_zstd_static_merge_loc_conf, /* merge location configuration */ +}; + + +ngx_module_t ngx_http_zstd_static_module = { + NGX_MODULE_V1, + &ngx_http_zstd_static_module_ctx, /* module context */ + ngx_http_zstd_static_commands, /* module directives */ + NGX_HTTP_MODULE, /* module type */ + NULL, /* init master */ + NULL, /* init module */ + NULL, /* init process */ + NULL, /* init thread */ + NULL, /* exit thread */ + NULL, /* exit process */ + NULL, /* exit master */ + NGX_MODULE_V1_PADDING +}; + + +static ngx_int_t +ngx_http_zstd_static_handler(ngx_http_request_t *r) +{ + u_char *p; + ngx_int_t rc; + ngx_uint_t level; + size_t root; + ngx_str_t path; + ngx_buf_t *b; + ngx_log_t *log; + ngx_table_elt_t *h; + ngx_chain_t out; + ngx_open_file_info_t of; + ngx_http_core_loc_conf_t *clcf; + ngx_http_zstd_static_conf_t *zscf; + + if (!(r->method & (NGX_HTTP_GET|NGX_HTTP_HEAD))) { + return NGX_DECLINED; + } + + if (r->uri.data[r->uri.len - 1] == '/') { + return NGX_DECLINED; + } + + zscf = ngx_http_get_module_loc_conf(r, ngx_http_zstd_static_module); + + if (zscf->enable == NGX_HTTP_ZSTD_STATIC_OFF) { + return NGX_DECLINED; + } + + if (zscf->enable == 
NGX_HTTP_ZSTD_STATIC_ON) { + rc = ngx_http_zstd_ok(r); + + } else { + rc = NGX_OK; + } + + clcf = ngx_http_get_module_loc_conf(r, ngx_http_core_module); + + if (!clcf->gzip_vary && rc != NGX_OK) { + return NGX_DECLINED; + } + + log = r->connection->log; + + p = ngx_http_map_uri_to_path(r, &path, &root, sizeof(".zst") - 1); + if (p == NULL) { + return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + + *p++ = '.'; + *p++ = 'z'; + *p++ = 's'; + *p++ = 't'; + *p = '\0'; + + path.len = p - path.data; + + ngx_log_debug1(NGX_LOG_DEBUG_HTTP, log, 0, + "http filename: \"%s\"", path.data); + + ngx_memzero(&of, sizeof(ngx_open_file_info_t)); + + of.read_ahead = clcf->read_ahead; + of.directio = clcf->directio; + of.valid = clcf->open_file_cache_valid; + of.min_uses = clcf->open_file_cache_min_uses; + of.errors = clcf->open_file_cache_errors; + of.events = clcf->open_file_cache_events; + + if (ngx_http_set_disable_symlinks(r, clcf, &path, &of) != NGX_OK) { + return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + + if (ngx_open_cached_file(clcf->open_file_cache, &path, &of, r->pool) + != NGX_OK) + { + switch (of.err) { + + case 0: + return NGX_HTTP_INTERNAL_SERVER_ERROR; + + case NGX_ENOENT: + case NGX_ENOTDIR: + case NGX_ENAMETOOLONG: + + return NGX_DECLINED; + + case NGX_EACCES: +#if (NGX_HAVE_OPENAT) + case NGX_EMLINK: + case NGX_ELOOP: +#endif + + level = NGX_LOG_ERR; + break; + + default: + + level = NGX_LOG_CRIT; + break; + } + + ngx_log_error(level, log, of.err, + "%s \"%s\" failed", of.failed, path.data); + + return NGX_DECLINED; + } + + if (zscf->enable == NGX_HTTP_ZSTD_STATIC_ON) { + r->gzip_vary = 1; + + if (rc != NGX_OK) { + return NGX_DECLINED; + } + } + + ngx_log_debug1(NGX_LOG_DEBUG_HTTP, log, 0, "http static fd: %d", of.fd); + + if (of.is_dir) { + ngx_log_debug0(NGX_LOG_DEBUG_HTTP, log, 0, "http dir"); + return NGX_DECLINED; + } + +#if !(NGX_WIN32) /* the not regular files are probably Unix specific */ + + if (!of.is_file) { + ngx_log_error(NGX_LOG_CRIT, log, 0, + "\"%s\" is not a 
regular file", path.data); + + return NGX_HTTP_NOT_FOUND; + } + +#endif + + r->root_tested = !r->error_page; + + rc = ngx_http_discard_request_body(r); + if (rc != NGX_OK) { + return rc; + } + + log->action = "sending response to client"; + + r->headers_out.status = NGX_HTTP_OK; + r->headers_out.content_length_n = of.size; + r->headers_out.last_modified_time = of.mtime; + + if (ngx_http_set_etag(r) != NGX_OK) { + return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + + if (ngx_http_set_content_type(r) != NGX_OK) { + return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + + h = ngx_list_push(&r->headers_out.headers); + if (h == NULL) { + return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + + h->hash = 1; + ngx_str_set(&h->key, "Content-Encoding"); + ngx_str_set(&h->value, "zstd"); + r->headers_out.content_encoding = h; + + b = ngx_calloc_buf(r->pool); + if (b == NULL) { + return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + + b->file = ngx_pcalloc(r->pool, sizeof(ngx_file_t)); + if (b->file == NULL) { + return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + + rc = ngx_http_send_header(r); + + if (rc == NGX_ERROR || rc > NGX_OK || r->header_only) { + return rc; + } + + b->file_pos = 0; + b->file_last = of.size; + + b->in_file = b->file_last ? 1 : 0; + b->last_buf = (r == r->main) ? 
1 : 0; + b->last_in_chain = 1; + + b->file->fd = of.fd; + b->file->name = path; + b->file->log = log; + b->file->directio = of.is_directio; + + out.buf = b; + out.next = NULL; + + return ngx_http_output_filter(r, &out); +} + + +static ngx_int_t +ngx_http_zstd_ok(ngx_http_request_t *r) +{ + ngx_table_elt_t *ae; + + if (r != r->main) { + return NGX_DECLINED; + } + + ae = r->headers_in.accept_encoding; + if (ae == NULL) { + return NGX_DECLINED; + } + + if (ae->value.len < sizeof("zstd") - 1) { + return NGX_DECLINED; + } + + if (ngx_memcmp(ae->value.data, "zstd", 4) != 0 + && ngx_http_zstd_accept_encoding(&ae->value) != NGX_OK) + { + return NGX_DECLINED; + } + + + r->gzip_tested = 1; + r->gzip_ok = 0; + + return NGX_OK; +} + + +static ngx_int_t +ngx_http_zstd_accept_encoding(ngx_str_t *ae) +{ + u_char *p; + + p = ngx_strcasestrn(ae->data, "zstd", sizeof("zstd") - 1); + if (p == NULL) { + return NGX_DECLINED; + } + + if (p == ae->data || (*(p - 1) == ',' || *(p - 1) == ' ')) { + + p += sizeof("zstd") - 1; + + if (p == ae->data + ae->len || *p == ',' || *p == ' ' || *p == ';') { + return NGX_OK; + } + } + + return NGX_DECLINED; +} + + +static void * +ngx_http_zstd_static_create_loc_conf(ngx_conf_t *cf) +{ + ngx_http_zstd_static_conf_t *conf; + + conf = ngx_palloc(cf->pool, sizeof(ngx_http_zstd_static_conf_t)); + if (conf == NULL) { + return NULL; + } + + conf->enable = NGX_CONF_UNSET_UINT; + + return conf; +} + + +static char * +ngx_http_zstd_static_merge_loc_conf(ngx_conf_t *cf, void *parent, void *child) +{ + ngx_http_zstd_static_conf_t *prev = parent; + ngx_http_zstd_static_conf_t *conf = child; + + ngx_conf_merge_uint_value(conf->enable, prev->enable, + NGX_HTTP_ZSTD_STATIC_OFF); + + return NGX_CONF_OK; +} + + +static ngx_int_t +ngx_http_zstd_static_init(ngx_conf_t *cf) +{ + ngx_http_handler_pt *h; + ngx_http_core_main_conf_t *cmcf; + + cmcf = ngx_http_conf_get_module_main_conf(cf, ngx_http_core_module); + + h = 
ngx_array_push(&cmcf->phases[NGX_HTTP_CONTENT_PHASE].handlers); + if (h == NULL) { + return NGX_ERROR; + } + + *h = ngx_http_zstd_static_handler; + + return NGX_OK; +} diff --git a/src/deps/src/zstd-nginx-module/t/00-filter.t b/src/deps/src/zstd-nginx-module/t/00-filter.t new file mode 100644 index 000000000..1e4809807 --- /dev/null +++ b/src/deps/src/zstd-nginx-module/t/00-filter.t @@ -0,0 +1,8 @@ +use Test::Nginx::Socket::Lua; + +no_long_string(); +run_tests(); + +__DATA__ + + diff --git a/src/deps/src/zstd-nginx-module/t/01-static.t b/src/deps/src/zstd-nginx-module/t/01-static.t new file mode 100644 index 000000000..74fa13076 --- /dev/null +++ b/src/deps/src/zstd-nginx-module/t/01-static.t @@ -0,0 +1,198 @@ +use Test::Nginx::Socket; +use lib 'lib'; + +no_long_string(); +log_level 'debug'; +repeat_each(3); +plan tests => repeat_each() * ((blocks() - 3) * 5 + 3); +run_tests(); + + +__DATA__ + + +=== TEST 1: zstd_static off +--- config + location /test { + zstd_static off; + root ../../t/suite; + } +--- request +GET /test +--- response_headers +Content-Length: 59738 +ETag: "5be17d33-e95a" +!Content-Encoding +--- no_error_log +[error] + + + +=== TEST 2: zstd_static off (with accept-encoding header) +--- config + location /test { + zstd_static off; + root ../../t/suite; + } +--- request +GET /test +Accept-Encoding: gzip,zstd +--- response_headers +Content-Length: 59738 +ETag: "5be17d33-e95a" +!Content-Encoding +--- no_error_log +[error] + + + +=== TEST 3: zstd_static on +--- config + location /test { + zstd_static on; + root ../../t/suite; + } +--- request +GET /test +--- more_headers +Accept-Encoding: gzip, zstd +--- response_headers +Content-Length: 20706 +ETag: "5be17d33-50e2" +!Content-Encoding +Content-Encoding: zstd +--- no_error_log +[error] + + + +=== TEST 4: zstd_static on (without accept-encoding header) +--- config + location /test { + zstd_static on; + root ../../t/suite; + } +--- request +GET /test +--- response_headers +Content-Length: 59738 +ETag: 
"5be17d33-e95a" +Content-Encoding: zstd +!Content-Encoding +--- no_error_log +[error] + + + +=== TEST 5: zstd_static on (without zstd component in accept-encoding header) +--- config + location /test { + zstd_static on; + root ../../t/suite; + } +--- request +GET /test +--- more_headers +Accept-Encoding: gzip, br +--- response_headers +Content-Length: 59738 +ETag: "5be17d33-e95a" +!Content-Encoding +--- no_error_log +[error] + + + +=== TEST 6: zstd_static always +--- config + location /test { + zstd_static always; + root ../../t/suite; + } +--- request +GET /test +--- more_headers +Accept-Encoding: gzip, br +--- response_headers +Content-Length: 20706 +ETag: "5be17d33-50e2" +Content-Encoding: zstd +--- no_error_log +[error] + + + +=== TEST 6: zstd_static always (without accept-encoding header) +--- config + location /test { + zstd_static always; + root ../../t/suite; + } +--- request +GET /test +--- response_headers +Content-Length: 20706 +ETag: "5be17d33-50e2" +Content-Encoding: zstd +--- no_error_log +[error] + + + +=== TEST 7: zstd_static always (without zstd component in accept-encoding header) +--- config + location /test { + zstd_static always; + root ../../t/suite; + } +--- request +GET /test +--- more_headers +Accept-Encoding: gzip, br +--- response_headers +Content-Length: 20706 +ETag: "5be17d33-50e2" +Content-Encoding: zstd +--- no_error_log +[error] + + +=== TEST 8: zstd_static always (file does not exist) +--- config + location /test2 { + zstd_static always; + root ../../t/suite; + } +--- request +GET /test2 +--- more_headers +Accept-Encoding: gzip, br +--- error_code: 404 + + + +=== TEST 9: zstd_static on (file does not exist) +--- config + location /test2 { + zstd_static on; + root ../../t/suite; + } +--- request +GET /test2 +--- more_headers +Accept-Encoding: gzip, br +--- error_code: 404 + + + +=== TEST 10: zstd_static off (file does not exist) +--- config + location /test2 { + zstd_static off; + root ../../t/suite; + } +--- request +GET /test2 +--- 
more_headers +Accept-Encoding: gzip, br +--- error_code: 404 diff --git a/src/deps/src/zstd-nginx-module/t/suite/test b/src/deps/src/zstd-nginx-module/t/suite/test new file mode 100644 index 000000000..53ce3fa9b --- /dev/null +++ b/src/deps/src/zstd-nginx-module/t/suite/test @@ -0,0 +1,2040 @@ + + + + +Regular Expression Matching Can Be Simple And Fast + + + + +

+Regular Expression Matching Can Be Simple And Fast +
+(but is slow in Java, Perl, PHP, Python, Ruby, ...) +

+

+Russ Cox +
+rsc@swtch.com +
+January 2007 +
+ +

+ + +

Introduction

+ +

+This is a tale of two approaches to regular expression matching. +One of them is in widespread use in the +standard interpreters for many languages, including Perl. +The other is used only in a few places, notably most implementations +of awk and grep. +The two approaches have wildly different +performance characteristics: +

+ +
+
+ +
Perl graphThompson NFA graph +
+
+Time to match a?ⁿaⁿ against aⁿ
+
+
+ +

+Let's use superscripts to denote string repetition, +so that +a?³a³ +is shorthand for +a?a?a?aaa. +The two graphs plot the time required by each approach +to match the regular expression +a?ⁿaⁿ +against the string aⁿ. +

+ +

+Notice that Perl requires over sixty seconds to match +a 29-character string. +The other approach, labeled Thompson NFA for +reasons that will be explained later, +requires twenty microseconds to match the string. +That's not a typo. The Perl graph plots time in seconds, +while the Thompson NFA graph plots time in microseconds: +the Thompson NFA implementation +is a million times faster than Perl +when running on a minuscule 29-character string. +The trends shown in the graph continue: the +Thompson NFA handles a 100-character string in under 200 microseconds, +while Perl would require over 10¹⁵ years. +(Perl is only the most conspicuous example of a large +number of popular programs that use the same algorithm; +the above graph could have been Python, or PHP, or Ruby, +or many other languages. A more detailed +graph later in this article presents data for other implementations.) +

+ +

+It may be hard to believe the graphs: perhaps you've used Perl, +and it never seemed like regular expression matching was +particularly slow. +Most of the time, in fact, regular expression matching in Perl +is fast enough. +As the graph shows, though, it is possible +to write so-called “pathological” regular expressions that +Perl matches very very slowly. +In contrast, there are no regular expressions that are +pathological for the Thompson NFA implementation. +Seeing the two graphs side by side prompts the question, +“why doesn't Perl use the Thompson NFA approach?” +It can, it should, and that's what the rest of this article is about. +

+ +

+Historically, regular expressions are one of computer science's +shining examples of how using good theory leads to good programs. +They were originally developed by theorists as a +simple computational model, +but Ken Thompson introduced them to +programmers in his implementation of the text editor QED +for CTSS. +Dennis Ritchie followed suit in his own implementation +of QED, for GE-TSS. +Thompson and Ritchie would go on to create Unix, +and they brought regular expressions with them. +By the late 1970s, regular expressions were a key +feature of the Unix landscape, in tools such as +ed, sed, grep, egrep, awk, and lex. +

+ +

+Today, regular expressions have also become a shining +example of how ignoring good theory leads to bad programs. +The regular expression implementations used by +today's popular tools are significantly slower +than the ones used in many of those thirty-year-old Unix tools. +

+ +

+This article reviews the good theory: +regular expressions, finite automata, +and a regular expression search algorithm +invented by Ken Thompson in the mid-1960s. +It also puts the theory into practice, describing +a simple implementation of Thompson's algorithm. +That implementation, less than 400 lines of C, +is the one that went head to head with Perl above. +It outperforms the more complex real-world +implementations used by +Perl, Python, PCRE, and others. +The article concludes with a discussion of how +theory might yet be converted into practice +in the real-world implementations. +

+ +

+Regular Expressions +

+ + +

+Regular expressions are a notation for +describing sets of character strings. +When a particular string is in the set +described by a regular expression, +we often say that the regular expression +matches +the string. +

+ +

+The simplest regular expression is a single literal character. +Except for the special metacharacters +*+?()|, +characters match themselves. +To match a metacharacter, escape it with +a backslash: +\+ +matches a literal plus character. +

+ +

+Two regular expressions can be alternated or concatenated to form a new +regular expression: +if e₁ matches +s +and e₂ matches +t, +then e₁|e₂ matches +s +or +t, +and +e₁e₂ +matches +st. +

+ +

+The metacharacters +*, ++, +and +? +are repetition operators: +e₁* +matches a sequence of zero or more (possibly different) +strings, each of which matches e₁; +e₁+ +matches one or more; +e₁? +matches zero or one. +

+ +

+The operator precedence, from weakest to strongest binding, is +first alternation, then concatenation, and finally the +repetition operators. +Explicit parentheses can be used to force different meanings, +just as in arithmetic expressions. +Some examples: +ab|cd +is equivalent to +(ab)|(cd); +ab* +is equivalent to +a(b*). +

+ +

+The syntax described so far is a subset of the traditional Unix +egrep +regular expression syntax. +This subset suffices to describe all regular +languages: loosely speaking, a regular language is a set +of strings that can be matched in a single pass through +the text using only a fixed amount of memory. +Newer regular expression facilities (notably Perl and +those that have copied it) have added +many new operators +and escape sequences. These additions make the regular +expressions more concise, and sometimes more cryptic, but usually +not more powerful: +these fancy new regular expressions almost always have longer +equivalents using the traditional syntax. +

+ +

+One common regular expression extension that +does provide additional power is called +backreferences. +A backreference like +\1 +or +\2 +matches the string matched +by a previous parenthesized expression, and only that string: +(cat|dog)\1 +matches +catcat +and +dogdog +but not +catdog +nor +dogcat. +As far as the theoretical term is concerned, +regular expressions with backreferences +are not regular expressions. +The power that backreferences add comes at great cost: +in the worst case, the best known implementations require +exponential search algorithms, +like the one Perl uses. +Perl (and the other languages) +could not now remove backreference support, +of course, but they could employ much faster algorithms +when presented with regular expressions that don't have +backreferences, like the ones considered above. +This article is about those faster algorithms. +

+ +

+Finite Automata +

+ + + +

+Another way to describe sets of character strings is with +finite automata. +Finite automata are also known as state machines, +and we will use “automaton” and “machine” interchangeably. +

+ +

+As a simple example, here is a machine recognizing +the set of strings matched by the regular expression +a(bb)+a: +

+ +

DFA for a(bb)+a

+ +

+A finite automaton is always in one of its states, +represented in the diagram by circles. +(The numbers inside the circles are labels to make this +discussion easier; they are not part of the machine's operation.) +As it reads the string, it switches from state to state. +This machine has two special states: the start state s₀ +and the matching state s₄. +Start states are depicted with lone arrowheads pointing at them, +and matching states are drawn as a double circle. +

+ +

+The machine reads an input string one character at a time, +following arrows corresponding to the input to move from +state to state. +Suppose the input string is +abbbba. +When the machine reads the first letter of the string, the +a, +it is in the start state s₀. It follows the +a +arrow to state s₁. +This process repeats as the machine reads the rest of the string: +b +to +s₂, +b +to +s₃, +b +to +s₂, +b +to +s₃, +and finally +a +to +s₄. +

+

DFA execution on abbbba

+

+The machine ends in s₄, a matching state, so it +matches the string. +If the machine ends in a non-matching state, it does not +match the string. +If, at any point during the machine's execution, there is no +arrow for it to follow corresponding to the current +input character, the machine stops executing early. +

+ +

+The machine we have been considering is called a +deterministic +finite automaton (DFA), +because in any state, each possible input letter +leads to at most one new state. +We can also create machines +that must choose between multiple possible next states. +For example, this machine is equivalent to the previous +one but is not deterministic: +

+

NFA for a(bb)+a

+

+The machine is not deterministic because if it reads a +b +in state s₂, it has multiple choices for the next state: +it can go back to s₁ in hopes of seeing another +bb, +or it can go on to s₃ in hopes of seeing the final +a. +Since the machine cannot peek ahead to see the rest of +the string, it has no way to know which is the correct decision. +In this situation, it turns out to be interesting to +let the machine +always guess correctly. +Such machines are called non-deterministic finite automata +(NFAs or NDFAs). +An NFA matches an input string if there is some way +it can read the string and follow arrows to a matching state. +

+ +

+Sometimes it is convenient to let NFAs have arrows with no +corresponding input character. We will leave these arrows unlabeled. +An NFA can, at any time, choose to follow an unlabeled arrow +without reading any input. +This NFA is equivalent to the previous two, but the unlabeled arrow +makes the correspondence with +a(bb)+a +clearest: +

+

Another NFA for a(bb)+a

+ +

+Converting Regular Expressions to NFAs +

+ +

+Regular expressions and NFAs turn out to be exactly +equivalent in power: every regular expression has an +equivalent NFA (they match the same strings) and vice versa. +(It turns out that DFAs are also equivalent in power +to NFAs and regular expressions; we will see this later.) +There are multiple ways to translate regular expressions into NFAs. +The method described here was first described by Thompson +in his 1968 CACM paper. +

+ +

+The NFA for a regular expression is built up from partial NFAs +for each subexpression, with a different construction for +each operator. The partial NFAs have +no matching states: instead they have one or more dangling arrows, +pointing to nothing. The construction process will finish by +connecting these arrows to a matching state. +

+ +

+The NFAs for matching single characters look like: +

+

Single-character NFA

+

+The NFA for the concatenation e₁e₂ +connects the final arrow of the e₁ +machine to the start of the e₂ machine: +

+

Concatenation NFA

+

+The NFA for the alternation e₁|e₂ +adds a new start state with a choice of either the +e₁ machine or the e₂ machine. +

+

Alternation NFA

+

+The NFA for e? alternates the e machine with an empty path: +

+

Zero or one NFA

+

+The NFA for e* uses the same alternation but loops a +matching e machine back to the start: +

+

Zero or more NFA

+

+The NFA for e+ also creates a loop, but one that +requires passing through e at least once: +

+

One or more NFA

+ +

+Counting the new states in the diagrams above, we can see +that this technique creates exactly one state per character +or metacharacter in the regular expression, +excluding parentheses. +Therefore the number of states in the final NFA is at most +equal to the length of the original regular expression. +

+ +

+Just as with the example NFA discussed earlier, it is always possible +to remove the unlabeled arrows, and it is also always possible to generate +the NFA without the unlabeled arrows in the first place. +Having the unlabeled arrows makes the NFA easier for us to read +and understand, and they also make the C representation +simpler, so we will keep them. +

+ +

+Regular Expression Search Algorithms +

+ +

+Now we have a way to test whether a regular expression +matches a string: convert the regular expression to an NFA +and then run the NFA using the string as input. +Remember that NFAs are endowed with the ability to guess +perfectly when faced with a choice of next state: +to run the NFA using an ordinary computer, we must find +a way to simulate this guessing. +

+ +

+One way to simulate perfect guessing is to guess +one option, and if that doesn't work, try the other. +For example, consider the NFA for +abab|abbb +run on the string +abbb: +

+

NFA for abab|abbb

+

Backtracking execution on abbb

+

+At step 0, the NFA must make a choice: try to match +abab +or +try to match +abbb? +In the diagram, the NFA tries +abab, +but that fails after step 3. +The NFA then tries the other choice, leading to step 4 and eventually a match. +This backtracking approach +has a simple recursive implementation +but can read the input string many times +before succeeding. +If the string does not match, +the machine must try +all +possible execution paths before +giving up. +The NFA tried only two different paths in the example, +but in the worst case, there can be exponentially +many possible execution paths, leading to very slow run times. +

+ +

+A more efficient but more complicated way to simulate perfect +guessing is to guess both options simultaneously. +In this approach, the simulation allows the machine +to be in multiple states at once. To process each letter, +it advances all the states along all the arrows that +match the letter. +

+

Parallel execution on abbb

+

+The machine starts in the start state and all the states +reachable from the start state by unlabeled arrows. +In steps 1 and 2, the NFA is in two states simultaneously. +Only at step 3 does the state set narrow down to a single state. +This multi-state approach tries both paths at the same time, +reading the input only once. +In the worst case, the NFA might be in +every +state at each step, but this results in at worst a constant amount +of work independent of the length of the string, +so arbitrarily +large input strings can be processed in linear time. +This is a dramatic improvement over the exponential time +required by the backtracking approach. +The efficiency comes from tracking the set of reachable +states but +not +which paths were used to reach them. +In an NFA with +n +nodes, there can only be +n +reachable states at any step, but there might be +2ⁿ paths through the NFA. +

+ +

+Implementation +

+ +

+Thompson introduced the multiple-state simulation approach +in his 1968 paper. +In his formulation, the states of the NFA were represented +by small machine-code sequences, and the list of possible states +was just a sequence of function call instructions. +In essence, Thompson compiled the regular expression into clever +machine code. +Forty years later, computers are much faster and the +machine code approach is not as necessary. +The following sections +present an implementation written in portable ANSI C. +The full source code (under 400 lines) +and the benchmarking scripts are +available online. +(Readers who are unfamiliar or uncomfortable with C or pointers should +feel free to read the descriptions and skip over the actual code.) +

+ +

+Implementation: Compiling to NFA +

+ +

+The first step is to compile the regular expression +into an equivalent NFA. +In our C program, we will represent an NFA as a +linked collection of +State +structures: +

+
+struct State
+{
+	int c;
+	State *out;
+	State *out1;
+	int lastlist;
+};
+

+Each +State +represents one of the following three NFA fragments, +depending on the value of +c. +

+

Possible per-State NFA fragments

+

+(Lastlist +is used during execution and is explained in the next section.) +

+ +

+Following Thompson's paper, +the compiler builds an NFA from a regular expression in +postfix +notation with dot +(.) added +as an explicit concatenation operator. +A separate function +re2post +rewrites infix regular expressions like +“a(bb)+a” +into equivalent postfix expressions like +“abb.+.a.”. +(A “real” implementation would certainly +need to use dot as the “any character” metacharacter +rather than as a concatenation operator. +A real implementation would also probably build the +NFA during parsing rather than build an explicit postfix expression. +However, the postfix version is convenient and follows +Thompson's paper more closely.) +

+ +

+As the compiler scans the postfix expression, it maintains +a stack of computed NFA fragments. +Literals push new NFA fragments onto the stack, while +operators pop fragments off the stack and then +push a new fragment. +For example, +after compiling the +abb in abb.+.a., +the stack contains NFA fragments for +a, +b, +and +b. +The compilation of the +. +that follows pops the two +b +NFA fragments from the stack and pushes an NFA fragment for the +concatenation +bb.. +Each NFA fragment is defined by its start state and its +outgoing arrows: +

+struct Frag
+{
+	State *start;
+	Ptrlist *out;
+};
+

+Start +points at the start state for the fragment, +and +out +is a list of pointers to +State* +pointers that are not yet connected to anything. +These are the dangling arrows in the NFA fragment. +

+ +

+Some helper functions manipulate pointer lists: +

+Ptrlist *list1(State **outp);
+Ptrlist *append(Ptrlist *l1, Ptrlist *l2);
+
+void patch(Ptrlist *l, State *s);
+

+List1 +creates a new pointer list containing the single pointer +outp. +Append +concatenates two pointer lists, returning the result. +Patch +connects the dangling arrows in the pointer list +l +to the state +s: +it sets +*outp += +s +for each pointer +outp +in +l. +

+ +

+Given these primitives and a fragment stack, +the compiler is a simple loop over the postfix expression. +At the end, there is a single fragment left: +patching in a matching state completes the NFA. +

+State*
+post2nfa(char *postfix)
+{
+	char *p;
+	Frag stack[1000], *stackp, e1, e2, e;
+	State *s;
+
+	#define push(s) *stackp++ = s
+	#define pop()   *--stackp
+
+	stackp = stack;
+	for(p=postfix; *p; p++){
+		switch(*p){
+		/* compilation cases, described below */
+		}
+	}
+	
+	e = pop();
+	patch(e.out, matchstate);
+	return e.start;
+}
+

+The specific compilation cases mimic the translation +steps described earlier. +

+ + +

+Literal characters: +

+default:
+	s = state(*p, NULL, NULL);
+	push(frag(s, list1(&s->out)));
+	break;
+
+
+ +

+Catenation: +

+case '.':
+	e2 = pop();
+	e1 = pop();
+	patch(e1.out, e2.start);
+	push(frag(e1.start, e2.out));
+	break;
+
+
+ +

+Alternation: +

+case '|':
+	e2 = pop();
+	e1 = pop();
+	s = state(Split, e1.start, e2.start);
+	push(frag(s, append(e1.out, e2.out)));
+	break;
+
+
+ +

+Zero or one: +

+case '?':
+	e = pop();
+	s = state(Split, e.start, NULL);
+	push(frag(s, append(e.out, list1(&s->out1))));
+	break;
+
+
+ +

+Zero or more: +

+case '*':
+	e = pop();
+	s = state(Split, e.start, NULL);
+	patch(e.out, s);
+	push(frag(s, list1(&s->out1)));
+	break;
+
+
+ +

+One or more: +

+case '+':
+	e = pop();
+	s = state(Split, e.start, NULL);
+	patch(e.out, s);
+	push(frag(e.start, list1(&s->out1)));
+	break;
+
+
+
+ +

+Implementation: Simulating the NFA +

+ +

+Now that the NFA has been built, we need to simulate it. +The simulation requires tracking +State +sets, which are stored as a simple array list: +

+struct List
+{
+	State **s;
+	int n;
+};
+

+The simulation uses two lists: +clist +is the current set of states that the NFA is in, +and +nlist +is the next set of states that the NFA will be in, +after processing the current character. +The execution loop initializes +clist +to contain just the start state and then +runs the machine one step at a time. +

+int
+match(State *start, char *s)
+{
+	List *clist, *nlist, *t;
+
+	/* l1 and l2 are preallocated globals */
+	clist = startlist(start, &l1);
+	nlist = &l2;
+	for(; *s; s++){
+		step(clist, *s, nlist);
+		t = clist; clist = nlist; nlist = t;	/* swap clist, nlist */
+	}
+	return ismatch(clist);
+}
+

+To avoid allocating on every iteration of the loop, +match +uses two preallocated lists +l1 +and +l2 +as +clist +and +nlist, +swapping the two after each step. +

+ +

+If the final state list contains the matching state, +then the string matches. +

+int
+ismatch(List *l)
+{
+	int i;
+
+	for(i=0; i<l->n; i++)
+		if(l->s[i] == matchstate)
+			return 1;
+	return 0;
+}
+

+

+ +

+Addstate +adds a state to the list, +but not if it is already on the list. +Scanning the entire list for each add would be inefficient; +instead the variable +listid +acts as a list generation number. +When +addstate +adds +s +to a list, +it records +listid +in +s->lastlist. +If the two are already equal, +then +s +is already on the list being built. +Addstate +also follows unlabeled arrows: +if +s +is a +Split +state with two unlabeled arrows to new states, +addstate +adds those states to the list instead of +s. +

+void
+addstate(List *l, State *s)
+{
+	if(s == NULL || s->lastlist == listid)
+		return;
+	s->lastlist = listid;
+	if(s->c == Split){
+		/* follow unlabeled arrows */
+		addstate(l, s->out);
+		addstate(l, s->out1);
+		return;
+	}
+	l->s[l->n++] = s;
+}
+

+

+ +

+Startlist +creates an initial state list by adding just the start state: +

+List*
+startlist(State *s, List *l)
+{
+	listid++;
+	l->n = 0;
+	addstate(l, s);
+	return l;
+}
+

+

+ +

+Finally, +step +advances the NFA past a single character, using +the current list +clist +to compute the next list +nlist. +

+void
+step(List *clist, int c, List *nlist)
+{
+	int i;
+	State *s;
+
+	listid++;
+	nlist->n = 0;
+	for(i=0; i<clist->n; i++){
+		s = clist->s[i];
+		if(s->c == c)
+			addstate(nlist, s->out);
+	}
+}
+
+ +

+Performance +

+ +

+The C implementation just described was not written with performance in mind. +Even so, a slow implementation of a linear-time algorithm +can easily outperform a fast implementation of an +exponential-time algorithm once the exponent is large enough. +Testing a variety of popular regular expression engines on +a so-called pathological regular expression demonstrates this nicely. +

+ +

+Consider the regular expression +a?ⁿaⁿ. +It matches the string +aⁿ +when the +a? +are chosen not to match any letters, +leaving the entire string to be matched by the +aⁿ. +Backtracking regular expression implementations +implement the zero-or-one +? +by first trying one and then zero. +There are +n +such choices to make, a total of +2ⁿ possibilities. +Only the very last +possibility—choosing zero for all the ?—will lead to a match. +The backtracking approach thus requires +O(2ⁿ) time, so it will not scale much beyond n=25. +

+ +

+In contrast, Thompson's algorithm maintains state lists of length +approximately n and processes the string, also of length n, +for a total of O(n²) time. +(The run time is superlinear, +because we are not keeping the regular expression constant +as the input grows. +For a regular expression of length m run on text of length n, +the Thompson NFA requires O(mn) time.) +

+ +

+The following graph plots time required to check whether +a?ⁿaⁿ +matches +aⁿ: +

+ +
+
+
+
+
+Performance graph +
+regular expression and text size n +
+a?ⁿaⁿ +matching +aⁿ +
+
+
+
+
+ +

+Notice that the graph's y-axis has a logarithmic scale, +in order to be able to see a wide variety of times on a single graph. +

+ +

+From the graph it is clear that Perl, PCRE, Python, and Ruby are +all using recursive backtracking. +PCRE stops getting the right answer at +n=23, +because it aborts the recursive backtracking after a maximum number +of steps. +As of Perl 5.6, Perl's regular expression engine is +said to memoize +the recursive backtracking search, which should, at some memory cost, +keep the search from taking exponential amounts of time +unless backreferences are being used. +As the performance graph shows, the memoization is not complete: +Perl's run time grows exponentially even though there +are no backreferences +in the expression. +Although not benchmarked here, Java uses a backtracking +implementation too. +In fact, the +java.util.regex +interface requires a backtracking +implementation, because arbitrary Java code +can be substituted into the matching path. +PHP uses the PCRE library. +

+ +

+The thick blue line is the C implementation of Thompson's algorithm given above. +Awk, Tcl, GNU grep, and GNU awk +build DFAs, either precomputing them or using the on-the-fly +construction described in the next section. +

+ +

+Some might argue that this test is unfair to +the backtracking implementations, since it focuses on an +uncommon corner case. +This argument misses the point: +given a choice between an implementation +with a predictable, consistent, fast running time on all inputs +or one that usually runs quickly but can take +years of CPU time (or more) on some inputs, +the decision should be easy. +Also, while examples as dramatic as this one +rarely occur in practice, less dramatic ones do occur. +Examples include using +(.*) +(.*) +(.*) +(.*) +(.*) +to split five space-separated fields, or using +alternations where the common cases +are not listed first. +As a result, programmers often learn which constructs are +expensive and avoid them, or they turn to so-called +optimizers. +Using Thompson's NFA simulation does not require such adaptation: +there are no expensive regular expressions. +

+ +

+Caching the NFA to build a DFA +

+ +

+Recall that DFAs are more efficient to execute than NFAs, +because DFAs are only ever in one state at a time: they never +have a choice of multiple next states. +Any NFA can be converted into an equivalent DFA +in which each DFA state corresponds to a +list of NFA states. +

+ +

+For example, here is the NFA we used earlier for +abab|abbb, +with state numbers added: +

+

NFA for abab|abbb

+

+The equivalent DFA would be: +

+

DFA for abab|abbb

+

+Each state in the DFA corresponds to a list of +states from the NFA. +

+ +

+In a sense, Thompson's NFA simulation is +executing the equivalent DFA: each +List +corresponds to some DFA state, +and the +step +function is computing, given a list and a next character, +the next DFA state to enter. +Thompson's algorithm simulates the DFA by +reconstructing each DFA state as it is needed. +Rather than throw away this work after each step, +we could cache the +Lists +in spare memory, avoiding the cost of repeating the computation +in the future +and essentially computing the equivalent DFA as it is needed. +This section presents the implementation of such an approach. +Starting with the NFA implementation from the previous section, +we need to add less than 100 lines to build a DFA implementation. +

+ +

+To implement the cache, we first introduce a new data type +that represents a DFA state: +

+struct DState
+{
+	List l;
+	DState *next[256];
+	DState *left;
+	DState *right;
+};
+

+A +DState +is the cached copy of the list +l. +The array +next +contains pointers to the next state for each +possible input character: +if the current state is +d +and the next input character is +c, +then +d->next[c] +is the next state. +If +d->next[c] +is null, then the next state has not been computed yet. +Nextstate +computes, records, and returns the next state +for a given state and character. +

+ +

+The regular expression match follows +d->next[c] +repeatedly, calling +nextstate +to compute new states as needed. +

+int
+match(DState *start, char *s)
+{
+	int c;
+	DState *d, *next;
+	
+	d = start;
+	for(; *s; s++){
+		c = *s & 0xFF;
+		if((next = d->next[c]) == NULL)
+			next = nextstate(d, c);
+		d = next;
+	}
+	return ismatch(&d->l);
+}
+

+

+ +

+All the +DStates +that have been computed need to be saved in a +structure that lets us look up a +DState +by its +List. +To do this, we arrange them +in a binary tree +using the sorted +List +as the key. +The +dstate +function returns the +DState +for a given +List, +allocating one if necessary: +

+DState*
+dstate(List *l)
+{
+	int i;
+	DState **dp, *d;
+	static DState *alldstates;
+
+	qsort(l->s, l->n, sizeof l->s[0], ptrcmp);
+
+	/* look in tree for existing DState */
+	dp = &alldstates;
+	while((d = *dp) != NULL){
+		i = listcmp(l, &d->l);
+		if(i < 0)
+			dp = &d->left;
+		else if(i > 0)
+			dp = &d->right;
+		else
+			return d;
+	}
+	
+	/* allocate, initialize new DState */
+	d = malloc(sizeof *d + l->n*sizeof l->s[0]);
+	memset(d, 0, sizeof *d);
+	d->l.s = (State**)(d+1);
+	memmove(d->l.s, l->s, l->n*sizeof l->s[0]);
+	d->l.n = l->n;
+
+	/* insert in tree */
+	*dp = d;
+	return d;
+}
+

+Nextstate runs the NFA +step +and returns the corresponding +DState: +

+DState*
+nextstate(DState *d, int c)
+{
+	step(&d->l, c, &l1);
+	return d->next[c] = dstate(&l1);
+}
+

+Finally, the DFA's start state is the +DState +corresponding to the NFA's start list: +

+DState*
+startdstate(State *start)
+{
+	return dstate(startlist(start, &l1));
+}
+

+(As in the NFA simulation, +l1 +is a preallocated +List.) +

+ +

+The +DStates +correspond to DFA states, but the DFA is only built as needed: +if a DFA state has not been encountered during the search, +it does not yet exist in the cache. +An alternative would be to compute the entire DFA at once. +Doing so would make +match +a little faster by removing the conditional branch, +but at the cost of increased startup time and +memory use. +

+ +

+One might also worry about bounding the amount of +memory used by the on-the-fly DFA construction. +Since the +DStates +are only a cache of the +step +function, the implementation of +dstate +could choose to throw away the entire DFA so far +if the cache grew too large. +This cache replacement policy +only requires a few extra lines of code in +dstate +and in +nextstate, +plus around 50 lines of code for memory management. +An implementation is +available online. +(Awk +uses a similar limited-size cache strategy, +with a fixed limit of 32 cached states; this explains the discontinuity +in its performance at n=28 in the graph above.) +

+ +

+NFAs derived from regular expressions +tend to exhibit good locality: they visit the same states +and follow the same transition arrows over and over +when run on most texts. +This makes the caching worthwhile: the first time an arrow +is followed, the next state must be computed as in the NFA +simulation, but future traversals of the arrow are just +a single memory access. +Real DFA-based implementations can make use +of additional optimizations to run even faster. +A companion article (not yet written) will explore +DFA-based regular expression implementations in more detail. +

+ + +

+Real world regular expressions +

+ +

+Regular expression usage in real programs +is somewhat more complicated than what the regular expression +implementations described above can handle. +This section briefly describes the common complications; +full treatment of any of these is beyond the scope of this +introductory article. +

+ +

+Character classes. +A character class, whether +[0-9] +or +\w +or +. (dot), +is just a concise representation of an alternation. +Character classes can be expanded into alternations +during compilation, though it is more efficient to add +a new kind of NFA node to represent them explicitly. +POSIX +defines special character classes +like [[:upper:]] that change meaning +depending on the current locale, but the hard part of +accommodating these is determining their meaning, +not encoding that meaning into an NFA. +

+ +

+Escape sequences. +Real regular expression syntaxes need to handle +escape sequences, both as a way to match metacharacters +(\(, +\), +\\, +etc.) +and to specify otherwise difficult-to-type characters such as +\n. +

+ +

+Counted repetition. +Many regular expression implementations provide a counted +repetition operator +{n} +to match exactly +n +strings matching a pattern; +{n,m} +to match at least +n +but no more than +m; +and +{n,} +to match +n +or more. +A recursive backtracking implementation can implement +counted repetition using a loop; an NFA or DFA-based +implementation must expand the repetition: +e{3} +expands to +eee; +e{3,5} +expands to +eeee?e?, +and +e{3,} +expands to +eee+. +

+ +

+Submatch extraction. +When regular expressions are used for splitting or parsing strings, +it is useful to be able to find out which sections of the input string +were matched by each subexpression. +After a regular expression like +([0-9]+-[0-9]+-[0-9]+) +([0-9]+:[0-9]+) +matches a string (say a date and time), +many regular expression engines make the +text matched by each parenthesized expression +available. +For example, one might write in Perl: +

+if(/([0-9]+-[0-9]+-[0-9]+) ([0-9]+:[0-9]+)/){
+	print "date: $1, time: $2\n";
+}
+

+The extraction of submatch boundaries has been mostly ignored +by computer science theorists, and it is perhaps the most +compelling argument for using recursive backtracking. +However, Thompson-style algorithms can be adapted to +track submatch boundaries without giving up efficient performance. +The Eighth Edition Unix +regexp(3) +library implemented such an algorithm as early as 1985, +though as explained below, +it was not very widely used or even noticed. +

+ +

+Unanchored matches. +This article has assumed that regular expressions +are matched against an entire input string. +In practice, one often wishes to find a substring +of the input that matches the regular expression. +Unix tools traditionally return the longest matching substring +that starts at the leftmost possible point in the input. +An unanchored search for +e +is a special case +of submatch extraction: it is like searching for +.*(e).* +where the first +.* +is constrained to match as short a string as possible. +

+ +

+Non-greedy operators. +In traditional Unix regular expressions, the repetition operators +?, +*, +and ++ +are defined to match as much of the string as possible while +still allowing the entire regular expression to match: +when matching +(.+)(.+) +against +abcd, +the first +(.+) +will match +abc, +and the second +will match +d. +These operators are now called +greedy. +Perl introduced +??, +*?, +and ++? +as non-greedy versions, which match as little of the string +as possible while preserving the overall match: +when matching +(.+?)(.+?) +against +abcd, +the first +(.+?) +will match only +a, +and the second +will match +bcd. +By definition, whether an operator is greedy +cannot affect whether a regular expression matches a +particular string as a whole; it only affects the +choice of submatch boundaries. +The backtracking algorithm admits a simple implementation +of non-greedy operators: +try the shorter match before the longer one. +For example, in a standard backtracking implementation, +e? +first tries using +e +and then tries not using it; +e?? +uses the other order. +The submatch-tracking variants of Thompson's algorithm +can be adapted to accommodate non-greedy operators. +

+ +

+Assertions. +The traditional regular expression metacharacters +^ +and +$ +can be viewed as +assertions +about the text around them: +^ +asserts that the previous character +is a newline (or the beginning of the string), +while +$ +asserts that the next character is a newline +(or the end of the string). +Perl added more assertions, like +the word boundary +\b, +which asserts that +the previous character is alphanumeric but the next +is not, or vice versa. +Perl also generalized the idea to arbitrary +conditions called lookahead assertions: +(?=re) +asserts that the text after the current input position matches +re, +but does not actually advance the input position; +(?!re) +is similar but +asserts that the text does not match +re. +The lookbehind assertions +(?<=re) +and +(?<!re) +are similar but make assertions about the text +before the current input position. +Simple assertions like +^, +$, +and +\b +are easy to accommodate in an NFA, +delaying the match one byte for forward assertions. +The generalized assertions +are harder to accommodate but in principle could +be encoded in the NFA. +

+ +

+Backreferences. +As mentioned earlier, no one knows how to +implement regular expressions with backreferences efficiently, +though no one can prove that it's impossible either. +(Specifically, the +problem is NP-complete, meaning that if +someone did find an efficient implementation, that would +be major news to computer scientists and would +win a million dollar prize.) +The simplest, most effective strategy for backreferences, +taken by the original awk and egrep, is not to implement them. +This strategy is no longer practical: users have come to +rely on backreferences for at least occasional use, +and backreferences are part of +the +POSIX standard for regular expressions. +Even so, it would be reasonable to use Thompson's NFA simulation +for most regular expressions, and only bring out +backtracking when it is needed. +A particularly clever implementation could combine the two, +resorting to backtracking only to accommodate the backreferences. +

+ +

+Backtracking with memoization. +Perl's approach of using memoization to avoid exponential blowup +during backtracking +when possible is a good one. At least in theory, it should make +Perl's regular expressions behave more like an NFA and +less like backtracking. +Memoization does not completely solve the problem, though: +the memoization itself requires a memory footprint roughly +equal to the size of the text times the size of the regular expression. +Memoization also does not address the issue of the stack space used +by backtracking, which is linear in the size of the text: +matching long strings typically causes a backtracking +implementation to run out of stack space: +

+$ perl -e '("a" x 100000) =~ /^(ab?)*$/;'
+Segmentation fault (core dumped)
+$
+
+ +

+Character sets. +Modern regular expression implementations must deal with +large non-ASCII character sets such as Unicode. +The +Plan 9 regular expression library +incorporates Unicode by running an NFA with a +single Unicode character as the input character for each step. +That library separates the running of the NFA from decoding +the input, so that the same regular expression matching code +is used for both +UTF-8 +and wide-character inputs. +

+ +

+History and References +

+ + +

+Michael Rabin and Dana Scott +introduced non-deterministic finite automata +and the concept of non-determinism in 1959 +[7], +showing that NFAs can be simulated by +(potentially much larger) DFAs in which +each DFA state corresponds to a set of NFA states. +(They won the Turing Award in 1976 for the introduction +of the concept of non-determinism in that paper.) +

+ +

+R. McNaughton and H. Yamada +[4] +and +Ken Thompson +[9] +are commonly credited with giving the first constructions +to convert regular expressions into NFAs, +even though neither paper mentions the +then-nascent concept of an NFA. +McNaughton and Yamada's construction +creates a DFA, +and Thompson's construction creates IBM 7094 machine code, +but reading between the lines one can +see latent NFA constructions underlying both. +Regular expression to NFA constructions differ only in how they encode +the choices that the NFA must make. +The approach used above, mimicking Thompson, +encodes the choices with explicit choice +nodes +(the +Split +nodes above) +and unlabeled arrows. +An alternative approach, +the one most commonly credited to McNaughton and Yamada, +is to avoid unlabeled arrows, instead allowing NFA states to +have multiple outgoing arrows with the same label. +McIlroy +[3] +gives a particularly elegant implementation of this approach +in Haskell. +

+ +

+Thompson's regular expression implementation +was for his QED editor running on the CTSS +[10] +operating +system on the IBM 7094. +A copy of the editor can be found in archived CTSS sources +[5]. +L. Peter Deutsch and Butler Lampson +[1] +developed the first QED, but +Thompson's reimplementation was the first to use +regular expressions. +Dennis Ritchie, author of yet another QED implementation, +has documented the early history of the QED editor +[8]. +(Thompson, Ritchie, and Lampson later won +Turing awards for work unrelated to QED or finite automata.) +

+ +

+Thompson's paper marked the +beginning of a long line of regular expression implementations. +Thompson chose not to use his algorithm when +implementing the text editor ed, which appeared in +First Edition Unix (1971), or in its descendant grep, +which first appeared in the Fourth Edition (1973). +Instead, these venerable Unix tools used +recursive backtracking! +Backtracking was justifiable because the +regular expression syntax was quite limited: +it omitted grouping parentheses and the +|, +?, +and ++ +operators. +Al Aho's egrep, +which first appeared in the Seventh Edition (1979), +was the first Unix tool to provide +the full regular expression syntax, using a +precomputed DFA. +By the Eighth Edition (1985), egrep computed the DFA on the fly, +like the implementation given above. +

+ +

+While writing the text editor sam +[6] +in the early 1980s, +Rob Pike wrote a new regular expression implementation, +which Dave Presotto extracted into a library that +appeared in the Eighth Edition. +Pike's implementation +incorporated submatch tracking into an efficient NFA simulation +but, like the rest of the Eighth Edition source, was not widely +distributed. +Pike himself did not realize that his technique was anything new. +Henry Spencer reimplemented the Eighth Edition library +interface from scratch, but using backtracking, +and +released his implementation +into the public domain. +It became very widely used, eventually serving as the basis +for the slow regular expression implementations +mentioned earlier: Perl, PCRE, Python, and so on. +(In his defense, +Spencer knew the routines could be slow, +and he didn't know that a more efficient algorithm existed. +He even warned in the documentation, +“Many users have found the speed perfectly adequate, +although replacing the insides of egrep with this code +would be a mistake.”) +Pike's regular expression implementation, extended to +support Unicode, was made freely available +with sam in +late 1992, +but the particularly efficient +regular expression search algorithm went unnoticed. +The code is now available in many forms: as +part of sam, +as +Plan 9's regular expression library, +or +packaged separately for Unix. +Ville Laurikari independently discovered Pike's algorithm +in 1999, developing a theoretical foundation as well +[2]. +

+ + +

+Finally, any discussion of regular expressions +would be incomplete without mentioning +Jeffrey Friedl's book +Mastering Regular Expressions, +perhaps the most popular reference among today's programmers. +Friedl's book teaches programmers how best to use today's +regular expression implementations, but not how best to implement them. +What little text it devotes to implementation +issues perpetuates the widespread belief that recursive backtracking +is the only way to simulate an NFA. +Friedl makes it clear that he +neither understands nor respects +the underlying theory. +

+ +

+Summary +

+ +

+Regular expression matching can be simple and fast, using +finite automata-based techniques that have been known for decades. +In contrast, Perl, PCRE, Python, Ruby, Java, +and many other languages +have regular expression implementations based on +recursive backtracking that are simple but can be +excruciatingly slow. +With the exception of backreferences, the features +provided by the slow backtracking implementations +can be provided by the automata-based implementations +at dramatically faster, more consistent speeds. +

+ +

+The next article in this series, +“Regular Expression Matching: the Virtual Machine Approach,” discusses NFA-based submatch extraction. +The third article, “Regular Expression Matching in the Wild,” examines a production implementation. +The fourth article, “Regular Expression Matching with a Trigram Index,” explains how Google Code Search was implemented. +

+ +

+Acknowledgements +

+ +

+Lee Feigenbaum, +James Grimmelmann, +Alex Healy, +William Josephson, +and +Arnold Robbins +read drafts of this article and made many helpful suggestions. +Rob Pike clarified some of the history surrounding his +regular expression implementation. +Thanks to all. +

+ +

+References +

+ +

+ +[1] +L. Peter Deutsch and Butler Lampson, +“An online editor,” +Communications of the ACM 10(12) (December 1967), pp. 793–799. +http://doi.acm.org/10.1145/363848.363863 +

+ +[2] +Ville Laurikari, +“NFAs with Tagged Transitions, +their Conversion to Deterministic Automata +and +Application to Regular Expressions,” +in Proceedings of the Symposium on String Processing and +Information Retrieval, September 2000. +http://laurikari.net/ville/spire2000-tnfa.ps +

+ +[3] +M. Douglas McIlroy, +“Enumerating the strings of regular languages,” +Journal of Functional Programming 14 (2004), pp. 503–518. +http://www.cs.dartmouth.edu/~doug/nfa.ps.gz (preprint) +

+ +[4] +R. McNaughton and H. Yamada, +“Regular expressions and state graphs for automata,” +IRE Transactions on Electronic Computers EC-9(1) (March 1960), pp. 39–47. +

+ +[5] +Paul Pierce, +“CTSS source listings.” +http://www.piercefuller.com/library/ctss.html +(Thompson's QED is in the file +com5 +in the source listings archive and is marked as +0QED) +

+ +[6] +Rob Pike, +“The text editor sam,” +Software—Practice & Experience 17(11) (November 1987), pp. 813–845. +http://plan9.bell-labs.com/sys/doc/sam/sam.html +

+ +[7] +Michael Rabin and Dana Scott, +“Finite automata and their decision problems,” +IBM Journal of Research and Development 3 (1959), pp. 114–125. +http://www.research.ibm.com/journal/rd/032/ibmrd0302C.pdf +

+ +[8] +Dennis Ritchie, +“An incomplete history of the QED text editor.” +http://plan9.bell-labs.com/~dmr/qed.html +

+ +[9] +Ken Thompson, +“Regular expression search algorithm,” +Communications of the ACM 11(6) (June 1968), pp. 419–422. +http://doi.acm.org/10.1145/363347.363387 +(PDF) +

+ +[10] +Tom Van Vleck, +“The IBM 7094 and CTSS.” +http://www.multicians.org/thvv/7094.html +

+ +
+

+Discussion on reddit and perlmonks and +LtU +

+ +
+

+Copyright © 2007 Russ Cox. All Rights Reserved. +
+http://swtch.com/~rsc/regexp/ +

+
+ + + + + diff --git a/src/deps/src/zstd-nginx-module/t/suite/test.zst b/src/deps/src/zstd-nginx-module/t/suite/test.zst new file mode 100644 index 000000000..9d4e0c45e Binary files /dev/null and b/src/deps/src/zstd-nginx-module/t/suite/test.zst differ diff --git a/src/deps/src/zstd-nginx-module/valgrind.suppress b/src/deps/src/zstd-nginx-module/valgrind.suppress new file mode 100644 index 000000000..fe4f25545 --- /dev/null +++ b/src/deps/src/zstd-nginx-module/valgrind.suppress @@ -0,0 +1,218 @@ +{ + <insert_a_suppression_name_here> + Memcheck:Addr1 + fun:ngx_init_cycle + fun:ngx_master_process_cycle + fun:main +} +{ + <insert_a_suppression_name_here> + Memcheck:Addr4 + fun:ngx_init_cycle + fun:ngx_master_process_cycle + fun:main +} +{ + <insert_a_suppression_name_here> + Memcheck:Cond + fun:ngx_vslprintf + fun:ngx_snprintf + fun:ngx_sock_ntop + fun:ngx_event_accept + fun:ngx_epoll_process_events + fun:ngx_process_events_and_timers +} +{ + <insert_a_suppression_name_here> + Memcheck:Addr1 + fun:ngx_vslprintf + fun:ngx_snprintf + fun:ngx_sock_ntop + fun:ngx_event_accept +} +{ + <insert_a_suppression_name_here> + exp-sgcheck:SorG + fun:ngx_http_lua_ndk_set_var_get +} +{ + <insert_a_suppression_name_here> + exp-sgcheck:SorG + fun:ngx_http_variables_init_vars + fun:ngx_http_block +} +{ + <insert_a_suppression_name_here> + exp-sgcheck:SorG + fun:ngx_conf_parse +} +{ + <insert_a_suppression_name_here> + exp-sgcheck:SorG + fun:ngx_vslprintf + fun:ngx_log_error_core +} +{ + <insert_a_suppression_name_here> + Memcheck:Param + epoll_ctl(event) + fun:epoll_ctl +} +{ + <insert_a_suppression_name_here> + Memcheck:Cond + fun:ngx_conf_flush_files + fun:ngx_single_process_cycle +} +{ + <insert_a_suppression_name_here> + Memcheck:Cond + fun:memcpy + fun:ngx_vslprintf + fun:ngx_log_error_core + fun:ngx_http_charset_header_filter +} +{ + <insert_a_suppression_name_here> + Memcheck:Param + socketcall.setsockopt(optval) + fun:setsockopt + fun:drizzle_state_connect +} +{ + <insert_a_suppression_name_here> + Memcheck:Cond + fun:ngx_conf_flush_files + fun:ngx_single_process_cycle + fun:main +} +{ + <insert_a_suppression_name_here> + Memcheck:Leak + fun:malloc + fun:ngx_alloc + fun:ngx_event_process_init +} +{ + <insert_a_suppression_name_here> + Memcheck:Param + sendmsg(mmsg[0].msg_hdr) + fun:sendmmsg + fun:__libc_res_nsend +} +{ + <insert_a_suppression_name_here> + Memcheck:Param + sendmsg(msg.msg_iov[0]) + fun:__sendmsg_nocancel + fun:ngx_write_channel + fun:ngx_pass_open_channel +
fun:ngx_start_cache_manager_processes +} +{ + <insert_a_suppression_name_here> + Memcheck:Cond + fun:ngx_init_cycle + fun:ngx_master_process_cycle + fun:main +} +{ + <insert_a_suppression_name_here> + Memcheck:Cond + fun:index + fun:expand_dynamic_string_token + fun:_dl_map_object + fun:map_doit + fun:_dl_catch_error + fun:do_preload + fun:dl_main + fun:_dl_sysdep_start + fun:_dl_start +} +{ + <insert_a_suppression_name_here> + Memcheck:Param + sendmsg(mmsg[0].msg_hdr) + fun:sendmmsg + fun:__libc_res_nsend + fun:__libc_res_nquery + fun:__libc_res_nquerydomain + fun:__libc_res_nsearch +} +{ + <insert_a_suppression_name_here> + Memcheck:Leak + match-leak-kinds: definite + fun:malloc + fun:ngx_alloc + fun:ngx_set_environment + fun:ngx_single_process_cycle +} +{ + <insert_a_suppression_name_here> + Memcheck:Cond + obj:* +} +{ + <insert_a_suppression_name_here> + Memcheck:Leak + match-leak-kinds: definite + fun:malloc + fun:ngx_alloc + fun:ngx_set_environment + fun:ngx_worker_process_init +} +{ + <insert_a_suppression_name_here> + Memcheck:Leak + match-leak-kinds: definite + fun:malloc + fun:ngx_alloc + fun:ngx_create_pool + fun:main +} +{ + <insert_a_suppression_name_here> + Memcheck:Param + epoll_pwait(sigmask) + fun:epoll_pwait + fun:epoll_wait + fun:ngx_epoll_process_events + fun:ngx_process_events_and_timers +} +{ + <insert_a_suppression_name_here> + Memcheck:Param + epoll_pwait(sigmask) + fun:epoll_pwait + fun:epoll_wait + fun:ngx_epoll_test_rdhup + fun:ngx_epoll_init + fun:ngx_event_process_init +} +{ + <insert_a_suppression_name_here> + Memcheck:Param + epoll_pwait(sigmask) + fun:epoll_pwait + fun:ngx_epoll_process_events + fun:ngx_process_events_and_timers +} +{ + <insert_a_suppression_name_here> + Memcheck:Param + epoll_pwait(sigmask) + fun:epoll_pwait + fun:ngx_epoll_test_rdhup + fun:ngx_epoll_init + fun:ngx_event_process_init +} +{ + <insert_a_suppression_name_here> + Memcheck:Leak + match-leak-kinds: possible + fun:malloc + fun:ngx_alloc + fun:ngx_crc32_table_init + fun:main +}