#!/usr/bin/perl -w
use utf8;
use File::Basename;
use Data::Printer;
use Data::Dumper;
use Parallel::Iterator qw( iterate );
use Env qw/PWD/;
use JSON::XS;
use HTML::Strip;
use AI::MicroStructure::Util;
use WWW::Wikipedia;
use Storable::CouchDB;
use LWP::UserAgent;
use HTML::SimpleLinkExtor;
use URI::Escape qw( uri_unescape );
# Package-level link extractor instance (imgTranslate refers to it as $e).
our $e = HTML::SimpleLinkExtor->new;

binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';


# Runtime configuration comes from AI::MicroStructure::Util::config().
# The keys read in this file are: couchdb, db, wikipedia and jsonout.
my $state  = AI::MicroStructure::Util::config();

my @CWD    = $state->{cwd};
our $config = $state->{cfg};


# At least one article name is required on the command line.
die("require a argument") unless($ARGV[0]);

# Package globals; call() declares its own lexical $doc/$linkdata/$result
# but assigns to the global @links.
our $doc ={};
our @links;
our $linkdata = {};
our $result;

my $url = $ARGV[0];
my @inx;
my @test;
my $search="";
# Substring a link must contain (or a known media extension) to be kept
# by the link filter in call().
my $TOP="wikipedia";


# CouchDB handle used by call() to persist every fetched article.
# The original failure branch read `die(@$)`, which passes the (empty)
# array named `$` rather than an error text; die with a real message.
our $couchdb = Storable::CouchDB->new(
uri =>$config->{couchdb},  #default
db  =>$config->{db}
) or die("cannot create Storable::CouchDB handle\n");



my $carry = {count=>0,max=>0};



# Build an iterator closure over a copy of the argument list.
#
# Every call of the returned code ref yields the pair ( index, value ) for
# the next element; once the list is exhausted it returns nothing (empty
# list / undef), which is what terminates a Parallel::Iterator run.
sub list_iter {
    my @items  = @_;
    my $cursor = 0;

    return sub {
        return unless $cursor < @items;

        my @pair = ( $cursor, $items[$cursor] );
        $cursor += 1;
        return @pair;
    };
}

# Iterator factory used for the image-link list in call().
#
# Takes a list, keeps a private copy, and returns a closure. Each call of
# the closure hands back ( index, value ) for the next element and returns
# nothing once all elements have been delivered.
sub down_iter {
    my @queue = @_;
    my $index = 0;

    return sub {
        if ( $index < @queue ) {
            my @r = ( $index, $queue[$index] );
            ++$index;
            return @r;
        }
        return;
    };
}


# Report whether a document with the given id already exists in CouchDB.
#
# Arguments:
#   $key - document id to look up.
# Returns:
#   1 when the database's _all_docs view returns at least one row for the
#   key, 0 otherwise. A failed HTTP request or an unparsable reply is
#   reported with a warning and counts as "not there" instead of killing
#   the script inside decode_json.
# Reads $config->{couchdb} and $config->{db} for the server and database.
sub checkIsThere {
    my $key = shift;
    return 0 unless defined $key;

    require LWP::UserAgent;
    require URI::Escape;

    my ( $server, $db ) = ( $config->{couchdb}, $config->{db} );

    # The view expects the key as a JSON string: protect embedded quotes and
    # backslashes, then URL-escape the whole value so characters such as
    # '&', '#' or '+' in an article name cannot alter the query string.
    ( my $json_key = $key ) =~ s/(["\\])/\\$1/g;
    my $quoted  = sprintf( '"%s"', $json_key );
    my $escaped = utf8::is_utf8($quoted)
        ? URI::Escape::uri_escape_utf8($quoted)
        : URI::Escape::uri_escape($quoted);

    my $ua  = LWP::UserAgent->new;
    my $res = $ua->get( sprintf( '%s/%s/_all_docs?key=%s', $server, $db, $escaped ) );

    unless ( $res->is_success ) {
        warn "checkIsThere: lookup of '$key' failed: ", $res->status_line, "\n";
        return 0;
    }

    my $reply = eval { JSON::XS::decode_json( $res->content ) };
    unless ( ref $reply eq 'HASH' ) {
        warn "checkIsThere: unexpected reply for '$key'\n";
        return 0;
    }

    my $rows = $reply->{rows};
    return ( ref $rows eq 'ARRAY' && @{$rows} ) ? 1 : 0;
}

# Decode a URL-encoded string.
#
# Works on a copy of the first argument: '+' becomes a space, every %XX
# hex escape becomes the corresponding character, and finally everything
# from the first "<!--" up to the last "-->" is removed (the match is
# greedy). Returns the decoded copy.
sub URLDecode {
    my ($decoded) = @_;

    for ($decoded) {
        tr/+/ /;
        s/%([a-fA-F0-9]{2,2})/chr(hex($1))/eg;
        s/<!--(.|\n)*-->//g;
    }

    return $decoded;
}

# Percent-encode a string.
#
# Returns a copy of the first argument in which every non-word character
# (anything matched by \W) is replaced by '%' followed by its code point
# as at least two upper-case hex digits.
sub URLEncode {
    my ($text) = @_;

    my $escape = sub {
        my $hex = sprintf( "%2.2x", ord( $_[0] ) );
        return "%" . uc($hex);
    };

    $text =~ s/([\W])/$escape->($1)/eg;
    return $text;
}

# Undo URI escaping and, when the result is valid UTF-8, decode it to
# characters.
#
# Arguments:
#   $_[0] - the value to decode.
# Returns:
#   The unescaped string, decoded from UTF-8 when utf8::decode accepts it,
#   otherwise the raw unescaped string.
#   A reference is handed back unchanged: call() routes entry fields such as
#   `related` through this function and dereferences the result as an array
#   afterwards, so a reference must never go through the string operations
#   below.
sub smartdecode {
    use URI::Escape qw( uri_unescape );

    my ($value) = @_;
    return $value if ref $value;

    my $unescaped = uri_unescape($value);
    my $decoded   = $unescaped;

    # utf8::decode modifies its argument in place and reports success.
    return utf8::decode($decoded) ? $decoded : $unescaped;
}

# Fetch a page and return the image links found in it.
#
# Called as a Parallel::Iterator worker from call(), hence the
# ( index, value ) argument pair.
#
# Arguments:
#   $idx - position of the URL in the iterated list (unused).
#   $url - address of the page to fetch.
# Returns:
#   An array ref with the <img> links of the page (resolved against the
#   response base), or nothing when no URL was given or the page could not
#   be fetched.
#
# The previous version only built an HTTP::Request object and parsed that
# request's own (empty) content, so no page was ever downloaded; it also
# fed a shared global extractor and read the extractor's internal hash
# instead of calling its img() accessor.
sub imgTranslate {
    my ( $idx, $url ) = @_;
    return unless $url;

    require LWP::UserAgent;
    require HTML::SimpleLinkExtor;

    my $response = LWP::UserAgent->new->get($url);
    unless ( $response->is_success ) {
        warn "imgTranslate: cannot fetch $url: ", $response->status_line, "\n";
        return;
    }

    my $html = $response->decoded_content;
    return unless defined $html;

    # A fresh extractor per page keeps links of different pages apart.
    my $extor = HTML::SimpleLinkExtor->new( $response->base );
    $extor->parse($html);

    # Stringify so the result is plain data when it is shipped back from a
    # worker process.
    return [ map { "$_" } $extor->img ];
}



# Split "Name (qualifier)" style titles into [ name, qualifier ] pairs.
#
# Titles are lower-cased, closing parentheses are dropped and blanks become
# underscores before splitting on "_(" . Titles without a parenthesised
# part are skipped. The arguments are not modified.
sub _title_pairs {
    my @pairs;
    for my $title (@_) {
        next unless defined $title && $title =~ /[(].+?[)]/;
        my $norm = lc $title;
        $norm =~ s/\)//g;
        $norm =~ s/ /_/g;
        push @pairs, [ split( "_\\(", $norm ) ];
    }
    return @pairs;
}

# Fetch one Wikipedia article, build its document and persist it.
#
# Used both directly and as a Parallel::Iterator worker, hence the
# ( index, value ) argument pair.
#
# Arguments:
#   $idx - position in the iterated list (unused here).
#   $url - article name; it is appended to $config->{wikipedia}.
# Returns:
#   The document hash ref on success, nothing when WWW::Wikipedia finds no
#   entry with text for the article.
# Side effects:
#   Overwrites the package global @links, appends to the file-level @test,
#   stores the document in CouchDB and writes it out via createJsonFile().
sub call {
    my ( $idx, $url ) = @_;

    warn "doing ", $url;

    my $ua       = LWP::UserAgent->new;
    my $response = $ua->get( sprintf( "%s%s", $config->{wikipedia}, ucfirst($url) ) );

    my $doc      = {};
    my $linkdata = {};

    my $wiki   = WWW::Wikipedia->new();
    my $hs     = HTML::Strip->new();
    my $result = $wiki->search( ucfirst $url );

    # Nothing to record without an entry that carries text.
    return unless defined($result) && $result->text();

    my $clean_text = $hs->parse( $result->text() );
    $hs->eof;

    no warnings 'utf8';

    # Collect every link of the fetched HTML page.
    my $extor = HTML::SimpleLinkExtor->new( $response->base );
    my $html  = $response->decoded_content;
    $extor->parse($html) if defined $html;
    my @all_links = map { "$_" } $extor->links;

    # Keep links that mention $TOP or a known media extension, minus the
    # maintenance / navigation pages.
    @links =
        grep { /$TOP|.pdf|.ogg|.mp3|.mpg|.mkv|.mp4|.avi|.wav|.gif|.jpeg|.jpg|.png|.svg|.tif|.tiff|.bmp/i }
        grep { !/Disambig|Help:|Wikipedia:|Special:|:Contents|:Featured_content|Main_Pag|_talk:|Talk:|#|[Aa]rticle[s|_]|All_*_*/ }
        @all_links;

    $linkdata->{pdf}   = [ grep { /^http.*.[\.](pdf)$/i } @links ];
    $linkdata->{image} = [ grep { /http.*.[\.](svg|jpeg|jpg|png|gif|tif|tiff|bmp)$/i } @links ];
    $linkdata->{audio} = [ grep { /^http.*.[\.](mp3|mpg|wav|ogg|avi|mkv|mp4)$/i } @links ];

    # Run imgTranslate over every image link; the results are drained but
    # not used.
    my $url_iter  = down_iter( @{ $linkdata->{image} } );
    my $page_iter = iterate( \&imgTranslate, $url_iter );
    while ( my ( $index, $value ) = $page_iter->() ) {
    }

    $doc->{linknr} = $#links;
    $doc->{url}    = $url;
    $doc->{tags}   = [ _title_pairs( $result->related() ) ];

    # The same tag extraction is applied to the link targets, with the
    # wiki prefix removed first.
    my $ltxt = join( "\n", @links );
    $ltxt =~ s/http:\/\/en.wikipedia.org\/wiki\///g; # FIXME replace with $config->{wikipedia}
    push @{ $doc->{tags} }, _title_pairs( split( "\n", $ltxt ) );

    # name => qualifier for every complete pair.
    my %qualifier_of;
    for my $pair ( @{ $doc->{tags} } ) {
        next unless $pair->[0] && $pair->[1];
        $qualifier_of{ lc $pair->[0] } = lc $pair->[1];
        push @test, lc $pair->[0];
    }

    $doc->{members}   = [ grep { !/\W/ } keys %qualifier_of ];
    $doc->{instances} = [ grep { !/\W/ } values %qualifier_of ];

    $doc->{article} = $clean_text;

    # Store the list itself: assigning @links directly recorded only the
    # element count, which left all of the per-namespace lists below empty.
    $doc->{links} = [@links];
    $doc->{pdf}   = $linkdata->{pdf};
    $doc->{audio} = $linkdata->{audio};
    $doc->{image} = $linkdata->{image};

    # FIXME replace with $config->{wikipedia}
    $doc->{book}   = [ grep { /^http:\/\/en.wikipedia.org\/wiki\/Book:/ } @{ $doc->{links} } ];
    $doc->{portal} = [ grep { /^http:\/\/en.wikipedia.org\/wiki\/Portal:/ } @{ $doc->{links} } ];
    $doc->{cat}    = [ grep { /^http:\/\/en.wikipedia.org\/wiki\/Category:/ } @{ $doc->{links} } ];
    $doc->{list}   = [ grep { /^http:\/\/en.wikipedia.org\/wiki\/List_of/ } @{ $doc->{links} } ];
    $doc->{tmpl}   = [ grep { /^http:\/\/en.wikipedia.org\/wiki\/Template:/ } @{ $doc->{links} } ];
    $doc->{recorded} = time;

    # Copy selected fields of the entry object. References are taken over
    # as they are so that `related` is still an array below; plain values
    # go through smartdecode().
    for my $field (qw( cursor related categories headings currentlang )) {
        my $raw = $result->{$field};
        $doc->{$field} = ref $raw ? $raw : smartdecode($raw);
    }

    my @related = ref $doc->{related} eq 'ARRAY' ? @{ $doc->{related} } : ();
    $doc->{related} = [
        grep { !/#/ }
        map { ( my $title = $_ ) =~ s/ /_/g; ucfirst $title }
        grep { defined } @related
    ];

    # Length of the JSON form of the document; the old sprintf($doc) only
    # measured the "HASH(0x...)" text of the reference. Falls back to 0 if
    # the document cannot be encoded.
    $doc->{size} = eval { length encode_json($doc) } || 0;

    # Block eval: no run-time string compilation, and a failure is detected
    # from the exception rather than from store()'s return value.
    eval { $couchdb->store( "$url", $doc ); 1 } or warn "error: $@\n";
    createJsonFile( $url, $doc );

    return $doc;
}



# Main driver: fetch every article named on the command line, then fetch
# each of that article's related articles through Parallel::Iterator.
# The documents of the related articles are collected in @out by index and
# each index is printed as its result arrives.
our @out = ();

for my $urlx (@ARGV) {
    $result->{$urlx} = call( 0, $urlx ) if $urlx;

    # Only articles that produced a document with related entries are
    # followed any further.
    next unless defined( $result->{$urlx} ) && @{ $result->{$urlx}->{related} };

    my $related_iter = list_iter( @{ $result->{$urlx}->{related} } );
    my $fetched      = iterate( \&call, $related_iter );

    while ( my ( $index, $value ) = $fetched->() ) {
        $out[$index] = $value;
        print "$index\n";
    }
}



# Write a document as JSON to "<jsonout>-<uri>.json".
#
# Arguments:
#   $uri  - article name, used as the file name suffix. Any '/' in it is
#           replaced by '_' so the name cannot point into a directory that
#           does not exist; names without '/' produce the same file name
#           as before.
#   $json - data structure to serialise with encode_json.
# Returns:
#   1 when the file was written and closed, nothing otherwise. Problems are
#   reported with a warning so that one unwritable file does not abort the
#   whole crawl.
# Reads $config->{jsonout} for the output path prefix.
sub createJsonFile{
    my ( $uri, $json ) = @_;

    require File::Path;

    # Create the output directory without going through a shell.
    my $dir = dirname( $config->{jsonout} );
    eval { File::Path::make_path($dir); 1 }
        or warn "createJsonFile: cannot create directory $dir: $@";

    ( my $safe_uri = $uri ) =~ s{/}{_}g;
    my $file = sprintf( '%s-%s.json', $config->{jsonout}, $safe_uri );

    # Three-argument open on a lexical handle; encode_json already yields
    # UTF-8 bytes, so the handle stays in raw mode.
    my $fh;
    unless ( open $fh, '>', $file ) {
        warn "createJsonFile: cannot write $file: $!\n";
        return;
    }
    binmode $fh;

    print {$fh} encode_json($json);

    # Buffered write errors only surface on close.
    unless ( close $fh ) {
        warn "createJsonFile: error closing $file: $!\n";
        return;
    }

    return 1;
}




1;

__DATA__

#!/usr/bin/perl -X
use utf8;
use File::Spec;
use Data::Printer;
use Parallel::Iterator qw( iterate );
use Env qw/PWD/;
use JSON::XS;
use HTML::Strip;
use Digest::MD5 qw(md5_hex);
use AI::MicroStructure::WordBlacklist;
use AI::MicroStructure::util;
use WWW::Wikipedia;
use Storable::CouchDB;
use LWP::UserAgent;
use HTML::SimpleLinkExtor;
use URI::Escape qw( uri_unescape );
use HTTP::Request::Common;
no warnings 'utf8';



binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';
push @ARGV,"Line";
die("require a argument") unless($ARGV[0]);
my $state = AI::MicroStructure::util::load_config(); my @CWD=$state->{cwd}; my $config=$state->{cfg};
$config->{couchdb}    ||= "http://localhost:5984/";
$config->{conceptimg} ||= "http://localhost/tiny/concept2.php";
$config->{wikipedia}  ||= "http://en.wikipedia.org/wiki/";
$config->{buffer} = "";
$config->{tempdir} ||= sprintf("/tmp/micro-temp/%s",$ARGV[0]);
$config->{language} ||= "en";
$config->{out} ||= "json";
$config->{"abs"} ||= 0;
$config->{download} ||= 1;

$config->{db} = $config->{out} =~ /wiki/ ? "micro-relations" : sprintf("%s/%s-relations.json",$CWD[0],$ARGV[0]);



our $doc ={};
our @links;
our $linkdata = {};
our $result;
my $url = $ARGV[0];
our $done;
our $doneit={};
my @inx;
my @test;
my $search="";
my $TOP="";

my $s=AI::MicroStructure::WordBlacklist::getStopWords('en');
our @s = keys %$s;
#print join('|',@s);




our $couch = Storable::CouchDB->new(
        uri =>$config->{couchdb},  #default
        db  =>$config->{db}
       );


my $carry = {count=>0,max=>0};



sub list_iter {
           my @ar = @_;
           my $pos = 0;
           return sub {
               return if $pos >= @ar;
               my @r = ( $pos, $ar[$pos] );  # Note: returns ( index, value )
               $pos++;
               return @r;
           };
       }


sub checkIsThere {

my $key =shift;


my $ua = LWP::UserAgent->new;
my ($server,$db) = ($config->{couchdb},$config->{db});

my $res = $ua->get(sprintf('%s/%s/_all_docs?key="%s"',
                            $server,
                            $db,
                            $key));
my $r =  JSON::XS::decode_json($res->content);
my $rows = $r && $r->{rows} && @{$r->{rows}};
if($rows)
{
return 1;
}else{
return 0;
}
return 1;
}

sub URLDecode {
my $theURL = $_[0];
$theURL =~ tr/+/ /;
$theURL =~ s/%([a-fA-F0-9]{2,2})/chr(hex($1))/eg;
$theURL =~ s/<!--(.|\n)*-->//g;
return $theURL;
}

sub URLEncode {
my $theURL = $_[0];
$theURL =~ s/([\W])/"%" . uc(sprintf("%2.2x",ord($1)))/eg;
return $theURL;
}

sub smartdecode {

my $x = my $y = uri_unescape($_[0]);
return $x if utf8::decode($x);
return $y;
}

sub imgTranslate {


my $url = shift;

if($url){
my $request = HTTP::Request::Common::GET($url);

my  $ua = LWP::UserAgent->new;
my $response = $ua->request($request);

if ($response->is_success) {
    print ".";

my $e = HTML::SimpleLinkExtor->new($response->base);
   $e->parse($response->decoded_content);

    return \@{$e->img};
}
else {
    print "E";
    #print STDERR $response->status_line, "\n";
    return ();
}
}
}




sub addAbs {

my ($doc,$out,$type) = @_;

return unless($doc && $out && $type);
my $to = $out->{abs}->{$type};

print "[abs+$type]";

 foreach(@{$doc->{$type}}) {

 $to->{$_} = defined($to->{$_})? $to->{$_}+1 : 1 if($_);

 }

}


sub delAbs {

my ($out,$type) = @_;

return unless( $out && $type);

print "[abs-$type]";

 foreach(sort keys %{$out->{abs}->{$type}}) {

 delete($out->{abs}->{$type}->{$_}) unless($out->{abs}->{$type}->{$_}>1);
 }

}

sub call  {

my ($idx,$url) = @_;
print "\ncheckout $url\n";

#return () unless (!checkIsThere($url) || $idx==0 || $config->{out}=~/json/);

# warn "doing $url";
#

#  return unless($url=~/Planet/);

my $ua = LWP::UserAgent->new;



my $content ;
my $response ="";
# @book = ();

my @book = ();
$response  = $ua->get(sprintf("%s%s",$config->{wikipedia}, ucfirst($url)));



print "[links]";

#      if ($response->base =~ /m{$TOP}/ ) {
      my $doc={};
      my $linkdata={};
      $linkdata->{base} = {};

      my $e = HTML::SimpleLinkExtor->new($response->base);
      $e->parse($response->decoded_content);

      my @links = [$e->all];

      print "[$#links]";


      my $wiki = WWW::Wikipedia->new(language=>$config->{language});
      my $hs = HTML::Strip->new();

  my $result = $wiki->search(ucfirst $url);
  if (defined($result) && $result->text() ) {

  my $clean_text = $hs->parse($result->text() );
  $hs->eof;

#            print "\n"x10,$clean_text;

           $doc={};
           $doc->{linknr}=$#links;
           $doc->{url}=$url;
           $doc->{tags}=[map{$_=lc($_); $_=~s/\)//g; $_=~s/ /_/g; $_=[split("_\\(",$_)] }grep {/[(].+?[)]/} $result->related()];
           $doc->{instances}= {};
           $doc->{members}={};

           my $ltxt = join("\n",@links);
              $ltxt =~ s/http:\/\/en.wikipedia.org\/wiki\///g; # FIXME replace with $config->{wikipedia}

           push @{$doc->{tags}},map{$_=lc($_); $_=~s/\)//g; $_=~s/ /_/g; $_=[split("_\\(",$_)] }grep {/[(].+?[)]/} split("\n",$ltxt);
           foreach(@{$doc->{tags}}){
            if($_->[0] && $_->[1]){
            $doc->{instances}->{lc $_->[0]}=lc $_->[1];
            push @test,lc $_->[0];
            }
#              push @test,$_->[1];


           }

           my @instances = [grep{!/\W/}values %{$doc->{instances}}];

             $doc->{members}=[grep{!/\W/}keys %{$doc->{instances}}];
             $doc->{instances}=[grep{!/\W/}values %{$doc->{instances}}];
#                 push @{$doc->{instances}},lc($url);

             $doc->{article}=$clean_text;
             $doc->{links}=@links;
             $doc->{audio}=$linkdata->{audio};
             # FIXME replace with $config->{wikipedia}
             $doc->{book}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Book:/}@{$doc->{links}}];
             $doc->{portal}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Portal:/}@{$doc->{links}}];
             $doc->{cat}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Category:/}@{$doc->{links}}];
             $doc->{list}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/List_of/}@{$doc->{links}}];
             $doc->{tmpl}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Template:/}@{$doc->{links}}];
             $doc->{pdf}=$linkdata->{pdf};
             $doc->{recorded}=time;


#        @{$doc->{image}} = map{$_=shift imgTranslate($_)} @{$doc->{image}};
#        @{$doc->{image}} = grep{ defined }@{$doc->{image}};



    $doc->{$_} = $_=~/full/? smartdecode($hs->parse($result->{$_})) : smartdecode($result->{$_}) for qw(            cursor
                                          related
                                          categories
                                          headings
                                          currentlang);

            $doc->{related} = [grep{!/#/}map{my $a = $_; $a =~ s/ /_/g; $_=ucfirst $a;}@{$doc->{related}}];
            $doc->{size} = length sprintf($doc);



   $couch->store("$url" ,$doc) unless($config->{out}=~/json/);

    print "[done]\n"; return $doc;

}


}

foreach my $urlx (@ARGV) {
if($urlx) { $result->{$urlx} = call(0,$urlx); print "\n"; }



if(defined($result->{$urlx}) && @{$result->{$urlx}->{related}}){

my $url_iter = list_iter( @{$result->{$urlx}->{related}});


my $page_iter = iterate( \&call, $url_iter );
our $out = {abs=>{pdf=>{},related=>{},image=>{},audio=>{}}};

 while ( my ( $index, $value ) = $page_iter->() ) {
     $out->{$index} = $value;
#     addAbs($out->{$index},$out,$_) for(qw(related image pdf audio));

    print "\n";
 }

#  delAbs($out,$_) for (qw(related));

    print "\n";

}

}


sub createJsonFile{

my $json = shift;


open (MYFILE, '>'.$config->{db});
print MYFILE $json;
close (MYFILE);
p $json;
}


BEGIN{
#    `IFS_BAK=\$IFS;
#      IFS=\$'\n';`;


}
END{
#   `IFS=\$IFS_BAK;`;
#
 if($config->{out}=~/json/) {



my $json = JSON::XS->new->pretty(1)->encode({ "query" => [@ARGV],
                                            "responce" =>[$config->{"abs"}==1 ? $out->{abs}:$out]});

createJsonFile($json);
$config->{buff} = $json;
p $config;
}

exit(0);

}



1;

__END__


if($config->{download}==1){
`mkdir -p $config->{tempdir}`;
`cd $config->{tempdir}`;
 `mkdir -p $config->{tempdir}"/$_";` for(qw(json image audio pdf));
 `cd $config->{tempdir}"/pdf"`;



 my @pdfs = keys %{$out->{abs}->{pdf}};
 foreach (sort @pdfs){
  my @f = split("/",$_);
   my $f = pop  @f;

  system(sprintf("wget %s -T 60 -O  %s/pdf/%s &",$_,$config->{tempdir},$f)) unless(-f sprintf("%s/pdf/%s",$config->{tempdir},$f));
  }

`cd $config->{tempdir}/audio`;
  my @oggs = keys %{$out->{abs}->{audio}};
 foreach (sort @oggs){
  my @f = split("/",$_);
   my $f = pop  @f;

  system(sprintf("wget %s -T 60 -O  %s/audio/%s;",$_,$config->{tempdir},$f)) unless(-f sprintf("%s/audio/%s",$config->{tempdir},$f));

 }

`cd $config->{tempdir}/image`;
  my @images = keys %{$out->{abs}->{image}};
 foreach (sort @images){
  my @f = split("/",$_);
   my $f = pop  @f;

  system(sprintf("wget %s -T 60 -O  %s/image/%s;",$_,$config->{tempdir},$f)) unless(-f sprintf("%s/image/%s",$config->{tempdir},$f));

 }

#      @pdfs = keys %{$out->{abs}->{pdf}};
#      foreach (sort @pdfs){
#       my @f = split("/",$_);
#        my $f = pop  @f;

#       if(-f sprintf("%s/pdf/%s",$config->{tempdir},$f)) {
#      system(sprintf("pdfimages  -f 1 -l 10  %s/pdf/%s %s/image/%s;",$config->{tempdir},$f,$config->{tempdir},$f));
#     }
#      }

my $x = sprintf("%s %s/json",$config->{db} ,$config->{tempdir});
`cp $x`;

`cd $config->{tempdir}/pdf;  micro-rename ;  micro-steemer`;
#  my $pdfd = $config->{tempdir}."/pdf";
#  `cd $pdfd`;
# `for i in $(ls *.pdf); do pdftotext $i ../text/$i;  done;`;




     foreach(@{$value->{related}}) {

     $out->{abs}->{related}->{$_} = defined($out->{abs}->{related}->{$_})? $out->{abs}->{related}->{$_}++ : 1;

     }

     foreach(@{$value->{pdf}}) {

     $out->{abs}->{pdf}->{$_} = defined($out->{abs}->{pdf}->{$_})? $out->{abs}->{pdf}->{$_}++ : 1;

     }
     foreach(@{$value->{image}}) {

     $out->{abs}->{image}->{$_} = defined($out->{abs}->{image}->{$_})? $out->{abs}->{image}->{$_}++ : 1;

     }
     foreach(@{$value->{audio}}) {

     $out->{abs}->{ogg}->{$_} = defined($out->{abs}->{ogg}->{$_})? $out->{abs}->{ogg}->{$_}++ : 1;

     }
