WiktionaryDumps to words
I misunderstood the data format at first, so the processing logic is now copied verbatim from the Julia entry.
# 20211209 Raku programming solution
use LWP::Simple;
use Compress::Bzip2;
use IO::Socket::SSL;
# Section heading that marks an entry for the language of interest.
my $LanguageMark = '==French==';

# How many distinct words to collect before stopping the download.
my $Target = 5;

# Location of the bzip2-compressed English Wiktionary dump.
my $URL = 'https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2';
# LWP::Simple subclass that streams a bzip2-compressed dump over HTTPS and
# stops reading as soon as enough matching page titles have been collected.
class CustomLWP is LWP::Simple {
   has $.URL;   # address of the compressed dump to stream

   # Requests $.URL, decompresses the body chunk by chunk and scans the
   # resulting text line by line.  A page title is recorded when a line
   # matching $LanguageMark is seen after that page's '<text' line.
   # Reading stops once $Target distinct titles are known (or the socket
   # yields no more data).  Returns the collected titles, at most $Target.
   method CustomRequest {
      my Blob $resp = Buf.new;
      my $bzip = Compress::Bzip2::Stream.new;
      my ( $tail, %needles ) = '';

      # Parser state must survive from one decompressed chunk to the next:
      # a page's <title> line and its language heading can land in
      # different chunks, so these are declared here rather than inside
      # the `whenever` block (where they would be reset on every chunk).
      my ( $title, $got_text_last ) = '', False;

      my ($host, $port, $path) = self.parse_url($.URL)[1..3];
      my $sock = IO::Socket::SSL.new: :$host, :$port;

      $sock.print( "GET {$path} HTTP/1.1\r\n" ~ self.stringify_headers( {
         'Connection' => 'close',
         'User-Agent' => "LWP::Simple/{LWP::Simple::<$VERSION>} " ~
                         "Raku/{$*RAKU.compiler.gist}",
         'Host'       => $host
      } ) ~ "\r\n" ) or die "Could not send the HTTP request to $host"; # request string

      # Keep reading until the parent class reports a complete header.
      while !self.got-header($resp) { $resp ~= $sock.read(2048) }

      my $bzip-stream = supply {
         # Body bytes that arrived together with the header.
         emit self.parse_response($resp)[2]; # $resp_content @ parent class
         loop {
            done if %needles.elems >= $Target ;
            ( my $chunk = $sock.read(4096) ) ?? emit $chunk !! done
         }
      }

      react {
         whenever $bzip-stream -> $crypt {
            my $plain = ( [~] $bzip.decompress: $crypt ).decode('utf8-c8');
            my @haystacks = $plain.split: "\n";
            # Glue the unfinished last line of the previous chunk onto the
            # first line of this one, and hold back this chunk's last line.
            @haystacks[0] = $tail ~ @haystacks[0];
            $tail = @haystacks[*-1];
            for @haystacks[0..*-2] {
               if / '<title>' (\w+?) '</title>' / {
                  # Stringify so the title outlives the match as a plain Str.
                  ($title,$got_text_last) = ~$0, False;
               } elsif / '<text' / {
                  $got_text_last = True
               } elsif / $LanguageMark / {
                  %needles{$title}++ if ( $got_text_last and $title.Bool );
                  last if ( %needles.elems >= $Target ) ;
                  $got_text_last = False;
               } elsif / '</text>' / { $got_text_last = False }
            }
         }
      }

      # The download is abandoned early on purpose; release the connection.
      $sock.close;

      return %needles.keys[^$Target]
   }
}
# Build the client for the configured dump and print every collected title.
my $ua = CustomLWP.new( :$URL );
$ua.CustomRequest».say
Output:
chien
gratuit
gratis
pond
livre
Last updated