HacDC Info

The following lightly commented, poorly structured bash/perl script downloads all the pages under http://wiki.hacdc.org/index.php/Category:Meeting_Minutes and greps through them for mentions of "bylaw", "standing", and "rule", which is helpful if you want to search through old meeting minutes. It can be modified for other purposes. It sucks, but it probably works for its intended purpose. It requires the HTML::LinkExtractor Perl module and either elinks or w3m.
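
If you want to sanity-check the prerequisites first, something like the following should work (a minimal sketch; the cpan invocation is just one way to install the module, not necessarily yours):

    #!/bin/bash
    # Bail out early if the HTML::LinkExtractor Perl module is missing.
    if ! perl -MHTML::LinkExtractor -e 1 2>/dev/null; then
        echo "missing HTML::LinkExtractor; try: cpan HTML::LinkExtractor" >&2
        exit 1
    fi
    # The script calls elinks, but w3m is an acceptable substitute.
    if ! command -v elinks >/dev/null 2>&1 && ! command -v w3m >/dev/null 2>&1; then
        echo "install elinks or w3m" >&2
        exit 1
    fi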

    #!/bin/bash
    # Pages are fetched into this directory, which must already exist.
    path="/tmp/hacdcminutes"
    baseurl="http://wiki.hacdc.org"
    url="$baseurl/index.php/Category:Meeting_Minutes"
    #only needed if you switch to curl or wget
    #uastr="Mozilla/5.0 (Windows NT 6.3; WOW64) Chrome/41.0.2226.0 Safari/537.36"

    # Read the category page HTML on stdin and print one article URL per
    # line, stopping at the "oldid" revision link near the bottom of the page.
    get_links() {
        perl -E '
            use strict;
            use warnings;
            use HTML::LinkExtractor;

            local $/;                      # slurp the whole page at once
            my $input = <STDIN>;
            my $LX = HTML::LinkExtractor->new();
            $LX->strip(1);                 # strip markup from the link text
            $LX->parse(\$input);

            for my $link ( @{ $LX->links } ) {
                last if $link->{href} =~ m{wiki.hacdc.org/index.php.title.Category.Meeting_Minutes.oldid.6639};
                # keep wiki page links whose link text is not blank
                if ( $link->{href} =~ /index\.php/
                     && defined $link->{_TEXT}
                     && $link->{_TEXT} !~ /^\s*$/ ) {
                    my $href = $link->{href};
                    $href =~ s/.*http/http/;   # trim any junk before the scheme
                    say $href;
                }
            }
        '
    }

    cd "$path" || exit
    set -v
    #can use w3m instead of elinks.  maybe also lynx or links?
    count=0
    elinks -source "$url" | get_links |
        # turn relative hrefs into absolute URLs; absolute ones pass through
        sed "s|^/|$baseurl/|" |
        while read -r line; do
            count=$((count + 1))
            echo "$line"
            elinks -dump "$line" > "$count.txt"
        done

    # search the numbered dumps with three lines of context around each hit
    grep -C 3 rule [0-9]*.txt > rule.txt
    grep -C 3 standing [0-9]*.txt > standing.txt
    grep -C 3 bylaw [0-9]*.txt > bylaw.txt
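
To run it, create the working directory first, then invoke the script and skim the three output files (fetch_minutes.sh is a made-up name for wherever you saved the script above):

    mkdir -p /tmp/hacdcminutes
    bash fetch_minutes.sh
    less /tmp/hacdcminutes/rule.txt

The page dumps accumulate in /tmp/hacdcminutes as 1.txt, 2.txt, and so on, so rerunning the script overwrites the previous run's results.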