Hacker News new | past | comments | ask | show | jobs | submit login
Remove HTTP headers from gzip or zip on stdin yy054 (revised)
3 points by textmode 9 months ago | hide | past | favorite | 2 comments
.

      /* remove HTTP headers from multiple gzip or single zip from stdin */
      /*
       * Reads one or more HTTP/1.x responses from stdin.  By default the
       * response headers are dropped and every other byte is copied to
       * stdout unchanged.  If a command-line argument is given, only the
       * headers are printed instead, each header block followed by an
       * extra LF.
       *
       * Headers are consumed line by line inside the exclusive HDR start
       * condition, so a header block of any length is recognised (the
       * earlier single-pattern form stopped matching past 1024 bytes and
       * let such headers leak into the output).
       */
     int fileno (FILE *);   /* NOTE(review): presumably for the generated scanner under strict ISO C -- confirm */
     int show_headers;      /* non-zero: print headers, suppress bodies */
    %option nounput noinput noyywrap
    %x HDR
    %%
    HTTP\/[01]\.[\40-\176]*\r\n {
         /* status line: a header block starts here */
         if(show_headers)fwrite(yytext,1,yyleng,yyout);
         BEGIN(HDR);
    }
    <HDR>\r\n {
         /* empty line: end of the header block */
         if(show_headers){fwrite(yytext,1,yyleng,yyout);putc(10,yyout);}
         BEGIN(INITIAL);
    }
    <HDR>[^\r\n]*\r\n {
         /* one complete header line, empty values included */
         if(show_headers)fwrite(yytext,1,yyleng,yyout);
    }
    <HDR>(.|\n) {
         /* bare CR or LF inside the header block: still header data */
         if(show_headers)fwrite(yytext,1,yyleng,yyout);
    }
    .|\n {
         /* body byte */
         if(!show_headers)fwrite(yytext,1,yyleng,yyout);
    }
    %%
    int main(int argc,char *argv[])
    {
         /* any first argument, whatever its value, selects header mode */
         if(argc>1&&argv[1])show_headers=1;
         yylex();
         return 0;
    }



Correction:

      /* remove HTTP headers from multiple gzip or single zip from stdin */
      /*
       * Copies stdin to stdout, dropping every HTTP header block: the
       * status line, the header lines after it and the empty line that
       * ends them.
       *
       * Header lines are skipped inside the exclusive HDR start
       * condition, which is left only on an empty line.  A header with
       * an empty value or with bytes outside printable ASCII therefore
       * neither leaks into the output nor ends header mode early, and
       * body bytes can never be glued onto a following status line.
       */
     int fileno (FILE *);   /* NOTE(review): presumably for the generated scanner under strict ISO C -- confirm */
    %option nounput noinput noyywrap
    %x HDR
    %%
    HTTP\/[\40-\176]+\r\n {
         /* status line: start skipping */
         BEGIN(HDR);
    }
    <HDR>\r\n {
         /* empty line: headers are over, resume copying */
         BEGIN(INITIAL);
    }
    <HDR>[^\r\n]*\r\n {
         /* one header line: dropped */
    }
    <HDR>(.|\n) {
         /* bare CR or LF inside the header block: dropped as well */
    }
    .|\n {
         /* body byte */
         fwrite(yytext,1,yyleng,yyout);
    }
    %%
    int main(void)
    {
         yylex();
         return 0;
    }

Usage example:

Retrieve hostnames, IP addresses and (if available) sitemap URLs from latest Common Crawl.

     # Fetch the compressed list of robots.txt archive paths.
     ftp -4 https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-50/robotstxt.paths.gz # <-- 180K
     # Pipeline below, stage by stage:
     #   gzip/head  - take the first 5 paths from the list
     #   sed        - wrap each path in a GET request; "[]" is a placeholder
     #                pair, every request but the last gets
     #                "Connection: keep-alive", the last gets "Connection: close"
     #   tr         - turn "[" into CR and "]" into LF
     #   openssl    - send all requests over a single TLS connection
     #   yy054      - strip the HTTP response headers
     #   zegrep     - keep only the selected lines and save them in 1.txt
     gzip -dc robotstxt.paths.gz \
     |head -5 \
     |sed 's>.*>GET /& HTTP/1.1[]Host: data.commoncrawl.org[]Connection: >;
           $!s/$/keep-alive[]/;$s/$/close[]/' \
     |tr [] '\r\n' \
     |openssl s_client -quiet -connect data.commoncrawl.org:443 \
     |yy054 \
     |zegrep -a '(^Sitemap:)|(^Host:)|(^WARC-Target-URI:)|(^WARC-IP-Address:)' > 1.txt
     exec cat 1.txt


Usage example:

Download NetBSD 1.0 in a single TCP connection.

    # y is the request-line prefix and z the Host header shared by all requests.
    # sed turns each name in the here-document into one request: every line
    # but the last becomes an HTTP/1.1 request with "Connection: keep-alive",
    # the last becomes an HTTP/1.0 request with no Connection header.
    # "[]" is a placeholder pair that tr turns into CR and LF.
    # The raw responses (headers and bodies) are saved in the file http+gzip.
    y="GET /pub/NetBSD-archive/NetBSD-1.0/source/src10/"
    z="Host: archive.netbsd.org"
    sed '$!s>.*>'"$y"'& HTTP/1.1[]'"$z"'[]Connection: keep-alive[]>;
         $s>.*>'"$y"'& HTTP/1.0[]'"$z"'[]>' << eof \
    |tr '[]' '\r\n' \
    |openssl s_client -quiet -connect 151.101.129.6:443 -servername archive.netbsd.org > http+gzip
    src10.aa
    src10.ab
    src10.ac
    src10.ad
    src10.ae
    src10.af
    src10.ag
    src10.ah
    src10.ai
    src10.aj
    src10.ak
    src10.al
    src10.am
    src10.an
    src10.ao
    src10.ap
    src10.aq
    src10.ar
    src10.as
    src10.at
    src10.au
    src10.av
    src10.aw
    src10.ax
    src10.ay
    src10.az
    src10.ba
    src10.bb
    src10.bc
    src10.bd
    src10.be
    src10.bf
    eof

    # Strip the HTTP headers and list the result as a gzip-compressed tar archive.
    yy054 < http+gzip|tar tvzf /dev/stdin
Alternate usage:

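Supplying any argv[1] makes yy054 print only the HTTP headers (this applies to the first listing; the main() in the corrected listing takes no arguments).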

    # The value of the argument is not examined; only its presence matters.
    yy054 print < http+gzip
    yy054 x < http+gzip




Guidelines | FAQ | Lists | API | Security | Legal | Apply to YC | Contact

Search: