Chapter 5 Writing a Content Generator
5.2 The Request, the Response, and the Environment
5.2.2 Reading Form Data
We now have the basis for reading input data. But the data are useful only if we know what to do with them. The most common form of data we need to handle on the Web is data sent to us by a web browser submitting an HTML form. Such data follow one of two standard formats supported by general-purpose browsers and controlled by the enctype attribute of the <form> element in HTML:
• application/x-www-form-urlencoded (normal web forms submitted
either by POST or GET)
• multipart/form-data (Netscape’s multipart format for file upload forms)
Historically, decoding form data in either of these formats is the responsibility of applications. For example, any CGI library or scripting module contains code for handling this task. Apache itself doesn’t include this capability as standard, but it is provided by third-party modules such as mod_form and mod_upload.
Parsing Form Data
The format for standard form data (application/x-www-form-urlencoded) is
a series of key/value pairs, separated by ampersands (“&”). Any character may be escaped using a %nn sequence, where nn is the hex representation of a byte, and some characters must be escaped. Parsing the data is complicated by the fact that keys are not always unique; for example, an HTML <select multiple> element
may submit several values for a key.
The natural structure representing these data is a table of bags. This structure can be represented in Apache as an apr_hash_t* (hash table) of apr_array_header_t*
(array) values. We can parse input data into this representation as follows:
/* Decode an application/x-www-form-urlencoded string into a hash whose
 * values are arrays of strings (one array entry per occurrence of a key).
 *
 * r     request whose pool supplies the hash, the arrays and the value copies
 * args  the encoded string; it is rewritten in place and NOT preserved
 *
 * Returns the hash, or NULL when args is NULL.
 */
static apr_hash_t *parse_form_from_string(request_rec *r, char *args)
{
    static const char separator[] = "&";
    apr_hash_t *form;
    char *state;
    char *token;

    if (args == NULL) {
        return NULL;
    }

    form = apr_hash_make(r->pool);

    /* Walk the '&'-separated pairs one token at a time. */
    token = apr_strtok(args, separator, &state);
    while (token != NULL) {
        apr_array_header_t *bag;
        char **slot;
        char *value;
        char *c;

        /* '+' stands for a space; translate it before the %nn decoding
         * so that an escaped "%2B" still ends up as a literal '+'.
         */
        for (c = token; *c != '\0'; c++) {
            if (*c == '+') {
                *c = ' ';
            }
        }

        /* Cut the token at the first '=' and decode both halves; a token
         * without '=' is a key with an empty value.
         */
        value = strchr(token, '=');
        if (value != NULL) {
            *value = '\0';
            value++;
            ap_unescape_url(token);
            ap_unescape_url(value);
        }
        else {
            value = "";
            ap_unescape_url(token);
        }

        /* A key may occur several times, so each key owns an array that
         * is created the first time the key is seen.
         */
        bag = apr_hash_get(form, token, APR_HASH_KEY_STRING);
        if (bag == NULL) {
            bag = apr_array_make(r->pool, 1, sizeof(const char*));
            apr_hash_set(form, token, APR_HASH_KEY_STRING, bag);
        }
        slot = apr_array_push(bag);
        *slot = apr_pstrdup(r->pool, value);

        token = apr_strtok(NULL, separator, &state);
    }

    return form;
}
This scheme is based on parsing the entire input data from a single input buffer. It works well where the total size of a form submission is reasonably small, as is generally the case with normal web forms. We should guard against denial of service (DoS) attacks by limiting the size of inputs accepted this way (the maximum size of data to accept being specified by a server administrator). Alternative methods involving streamed parsing may be appropriate for larger forms, particularly those involving file upload that could involve megabytes or even gigabytes of data. The
mod_upload[3] module provides a parser that is better suited to large uploads.
We can use the function we just defined to parse data submitted by GET:
/* Parse the query string of a request into a form hash.
 *
 * parse_form_from_string() rewrites its input in place, so it is given a
 * pool copy: r->args itself stays intact for anything else that reads the
 * query string later in the request. apr_pstrdup() returns NULL for a NULL
 * input, so a request without a query string still yields NULL.
 */
static apr_hash_t* parse_form_from_GET(request_rec *r)
{
    return parse_form_from_string(r, apr_pstrdup(r->pool, r->args));
}
Parsing data submitted by POST is more work, because we have to read the data: /* Get POSTed data. Assume we have already checked that the
* content type is application/x-www-form-urlencoded. * Assume *form is null on entry.
*/
/* Read the request body through the input filter chain, enforce the
 * MAX_SIZE limit, and parse the collected bytes as form data.
 *
 * r     the current request
 * form  out-parameter; set to the parsed hash on success and left
 *       untouched on every error path
 *
 * Returns OK on success, HTTP_BAD_REQUEST when Content-Length is not a
 * non-negative decimal number, HTTP_REQUEST_ENTITY_TOO_LARGE when the body
 * is larger than MAX_SIZE bytes, and HTTP_INTERNAL_SERVER_ERROR when the
 * body cannot be read.
 *
 * NOTE(review): MAX_SIZE is defined outside this listing; it is treated as
 * an int here, as the "%d" in the log messages implies - confirm.
 */
static int parse_form_from_POST(request_rec *r, apr_hash_t **form)
{
    long declared = -1;         /* Content-Length, or -1 when absent */
    apr_off_t readbytes;        /* how much to ask for per read */
    int eos = 0;                /* set once the EOS bucket is seen */
    int too_big = 0;            /* set once the body exceeds MAX_SIZE */
    apr_size_t count = 0;       /* bytes kept in bb; never above MAX_SIZE */
    apr_status_t rv;
    apr_bucket_brigade *bb;
    apr_bucket_brigade *bbin;
    char *buf;
    const char *clen = apr_table_get(r->headers_in, "Content-Length");

    if (clen != NULL) {
        char *end;

        /* Decimal only, and the whole header value must be consumed. An
         * out-of-range value saturates to LONG_MAX and is then rejected by
         * the size check below.
         */
        declared = strtol(clen, &end, 10);
        if (end == clen || *end != '\0' || declared < 0) {
            ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
                          "Invalid Content-Length \"%s\"", clen);
            return HTTP_BAD_REQUEST;
        }
        if (declared > MAX_SIZE) {
            ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
                          "Request too big (%ld bytes; limit %d)",
                          declared, MAX_SIZE);
            return HTTP_REQUEST_ENTITY_TOO_LARGE;
        }
    }

    /* Never request zero bytes: an absent or zero Content-Length falls
     * back to the limit itself as the read size.
     */
    readbytes = (declared > 0) ? (apr_off_t)declared : (apr_off_t)MAX_SIZE;

    bb = apr_brigade_create(r->pool, r->connection->bucket_alloc);
    bbin = apr_brigade_create(r->pool, r->connection->bucket_alloc);

    do {
        apr_bucket *b;
        apr_bucket *next;

        rv = ap_get_brigade(r->input_filters, bbin, AP_MODE_READBYTES,
                            APR_BLOCK_READ, readbytes);
        if (rv != APR_SUCCESS) {
            ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r,
                          "failed to read form input");
            return HTTP_INTERNAL_SERVER_ERROR;
        }

        for (b = APR_BRIGADE_FIRST(bbin);
             b != APR_BRIGADE_SENTINEL(bbin);
             b = next) {
            /* Remember the successor first: b may be unlinked from bbin
             * below, after which APR_BUCKET_NEXT(b) no longer belongs to
             * this brigade.
             */
            next = APR_BUCKET_NEXT(b);

            if (APR_BUCKET_IS_EOS(b)) {
                eos = 1;
            }
            if (APR_BUCKET_IS_METADATA(b)) {
                continue;       /* carries no data; freed by the cleanup */
            }

            if (b->length == (apr_size_t)(-1)) {
                /* Length not known yet: reading the bucket fixes its
                 * length and may insert a new bucket right after it.
                 */
                const char *data;
                apr_size_t len;

                rv = apr_bucket_read(b, &data, &len, APR_BLOCK_READ);
                if (rv != APR_SUCCESS) {
                    ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r,
                                  "failed to read form input");
                    return HTTP_INTERNAL_SERVER_ERROR;
                }
                next = APR_BUCKET_NEXT(b);
            }

            if (too_big) {
                continue;       /* just mopping up the rest of the body */
            }
            if (b->length > (apr_size_t)MAX_SIZE - count) {
                /* More data than we accept. Keep reading to EOS so the
                 * body is consumed, but stop storing it.
                 */
                too_big = 1;
                apr_brigade_cleanup(bb);
                continue;
            }

            count += b->length;
            APR_BUCKET_REMOVE(b);
            APR_BRIGADE_INSERT_TAIL(bb, b);
        }

        /* Free whatever was not moved to bb before the next read. */
        apr_brigade_cleanup(bbin);
    } while (!eos);

    /* OK, done with the data. Kill the request if we got too much data. */
    if (too_big) {
        ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
                      "Request too big (limit %d bytes)", MAX_SIZE);
        return HTTP_REQUEST_ENTITY_TOO_LARGE;
    }

    /* We've got all the data. Now put it in a buffer and parse it. */
    buf = apr_palloc(r->pool, count + 1);
    rv = apr_brigade_flatten(bb, buf, &count);
    if (rv != APR_SUCCESS) {
        ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r,
                      "Error (flatten) reading form data");
        return HTTP_INTERNAL_SERVER_ERROR;
    }
    buf[count] = '\0';

    *form = parse_form_from_string(r, buf);
    return OK;
}
At this point, we have laid the groundwork to ensure easy access to form data, and we can provide some accessor functions. mod_form performs a similar function, but
uses techniques we haven’t encountered yet to offer a cleaner API wherein the handler module need not concern itself with the hash.
The following example shows a function that returns all values for a key as a comma-separated string, a representation that will be familiar to users of scripting environments such as Perl (with CGI.pm) or PHP. Other high-level accessors are now similarly straightforward to write.
/* Return all values submitted for a key as one comma-separated string.
 *
 * pool  pool the result string is allocated from
 * form  hash of value arrays as built by the form parser; may be NULL
 * key   form field name to look up
 *
 * Returns the joined string, or NULL when form is NULL or the key is not
 * present (apr_array_pstrcat() must not be handed a NULL array).
 */
char *form_value(apr_pool_t *pool, apr_hash_t *form, const char *key)
{
    apr_array_header_t *v_arr;

    if (form == NULL || key == NULL) {
        return NULL;
    }

    v_arr = apr_hash_get(form, key, APR_HASH_KEY_STRING);
    if (v_arr == NULL) {
        return NULL;
    }

    /* Caveat: this is ambiguous because values may contain commas */
    return apr_array_pstrcat(pool, v_arr, ',');
}
Combining these functions, we can update our HelloWorld handler to display
form data. We’ll assume that the form data consist of ASCII input, and substitute question marks for any non-ASCII characters:
/* Content handler for requests mapped to the "helloworld" handler.
 *
 * Writes an HTML page containing the request headers, response headers and
 * environment tables, followed by any form data submitted with the request
 * (query string for GET, urlencoded body for POST).
 *
 * Returns DECLINED when the request is not for this handler,
 * HTTP_METHOD_NOT_ALLOWED for methods other than GET and POST, and OK
 * otherwise. A form-parsing failure is reported inside the page, because
 * output has already started by then.
 */
static int helloworld_handler(request_rec *r)
{
    apr_hash_t *formdata = NULL;
    int rv = OK;

    if (!r->handler || (strcmp(r->handler, "helloworld") != 0)) {
        return DECLINED;
    }

    /* We could be just slightly sloppy and drop this altogether,
     * but it's good practice to reject anything that's not explicitly
     * allowed. It cuts off *potential* exploits for someone trying
     * to compromise the server.
     */
    if ((r->method_number != M_GET) && (r->method_number != M_POST)) {
        return HTTP_METHOD_NOT_ALLOWED;
    }

    ap_set_content_type(r, "text/html;charset=ascii");
    ap_rputs("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n"
             "<html><head><title>Apache HelloWorld Module</title></head>"
             "<body><h1>Hello World!</h1>"
             "<p>This is the Apache HelloWorld module!</p>", r);

    /* Print the tables */
    printtable(r, r->headers_in, "Request Headers", "Header", "Value");
    printtable(r, r->headers_out, "Response Headers", "Header", "Value");
    printtable(r, r->subprocess_env, "Environment", "Variable", "Value");

    /* Display the form data */
    if (r->method_number == M_GET) {
        formdata = parse_form_from_GET(r);
    }
    else if (r->method_number == M_POST) {
        static const char form_type[] = "application/x-www-form-urlencoded";
        const size_t type_len = sizeof(form_type) - 1;
        const char *ctype = apr_table_get(r->headers_in, "Content-Type");

        /* Match the media type only: the header may carry parameters
         * after it, e.g. "...urlencoded; charset=UTF-8".
         */
        if (ctype && (strncasecmp(ctype, form_type, type_len) == 0)
            && (ctype[type_len] == '\0' || ctype[type_len] == ';'
                || ctype[type_len] == ' ' || ctype[type_len] == '\t')) {
            rv = parse_form_from_POST(r, &formdata);
        }
    }

    if (rv != OK) {
        ap_rputs("<p>Error reading form data!</p>", r);
    }
    else if (formdata == NULL) {
        ap_rputs("<p>No form data found.</p>", r);
    }
    else {
        /* Parsed the form successfully, so we have data to display */
        apr_hash_index_t *index;

        ap_rprintf(r, "<h2>Form data supplied by method %s</h2>\n<dl>",
                   r->method);

        for (index = apr_hash_first(r->pool, formdata);
             index != NULL;
             index = apr_hash_next(index)) {
            const void *key;
            void *entry;
            apr_array_header_t *arr;
            char **values;
            int i;

            apr_hash_this(index, &key, NULL, &entry);
            arr = entry;
            values = (char **)arr->elts;

            ap_rprintf(r, "<dt>%s</dt>\n",
                       ap_escape_html(r->pool, (const char *)key));

            /* Index the array instead of popping it: apr_array_pop()
             * hands back a pointer to the element slot (a char ** here),
             * empties the array and walks it backwards. Indexing leaves
             * the form intact and keeps the values in stored order.
             */
            for (i = 0; i < arr->nelts; ++i) {
                char *val = values[i];
                char *p;

                /* The page is declared as ASCII, so replace every other
                 * byte with '?'.
                 */
                for (p = val; *p != '\0'; ++p) {
                    if (!isascii((unsigned char)*p)) {
                        *p = '?';
                    }
                }
                ap_rprintf(r, "<dd>%s</dd>\n",
                           ap_escape_html(r->pool, val));
            }
        }
        ap_rputs("</dl>", r);
    }

    ap_rputs("</body></html>", r);
    return OK;
}