[Mailman-Users] Adding Namazu search to Mailman archives

Tom Morrison t.morrison at liant.com
Fri Jul 16 22:48:13 CEST 2004


I had a lot of problems with the existing pipermail.pl filter, so I took the
mhonarc.pl filter, which seemed to be similar, and created what is, in my
opinion at least, a better pipermail.pl to be found below.  (Watch out for
wrapped regular expressions...)

Best regards,
Tom Morrison
=======================
#
# -*- Perl -*-
#
#     This is free software with ABSOLUTELY NO WARRANTY.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either versions 2, or (at your option)
#  any later version.
#
#  This program is distributed in the hope that it will be useful
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#  02111-1307, USA
#

package pipermail;
use strict;
require 'util.pl';
require 'gfilter.pl';
require 'html.pl';
require 'mailnews.pl';

#
# This pattern specifies pipermail's file names.
#
my $PIPERMAIL_MESSAGE_FILE = '\d{6}\.html';

sub mediatype() {
    return ('text/html; x-type=pipermail');
}

sub status() {
    return 'yes';
}

sub recursive() {
    return 0;
}

sub pre_codeconv() {
    return 1;
}

sub post_codeconv () {
    return 0;
}

sub add_magic ($) {
    return;
}

sub filter ($$$$$) {
    my ($orig_cfile, $contref, $weighted_str, $headings, $fields)
      = @_;
    my $cfile = defined $orig_cfile ? $$orig_cfile : '';

    util::vprint("Processing pipermail file ...\n");

    unless ($cfile =~ /($PIPERMAIL_MESSAGE_FILE)$/o) 
    {
	return "is Pipermail's index file! skipped."; # error
    } 

    pipermail_filter($contref, $weighted_str, $fields);
    html::html_filter($contref, $weighted_str, $fields, $headings);

    $$contref =~ s/^\s+//;
    mailnews::uuencode_filter($contref);
    mailnews::mailnews_filter($contref, $weighted_str, $fields);
    mailnews::mailnews_citation_filter($contref, $weighted_str);

    gfilter::line_adjust_filter($contref);
    gfilter::line_adjust_filter($weighted_str);
    gfilter::white_space_adjust_filter($contref);
    gfilter::show_filter_debug_info($contref, $weighted_str,
			   $fields, $headings);
    return undef;
}

# Assume a normal message files by pipermail (mailman edition)
sub pipermail_filter ($$) {
    my ($contref, $weighted_str, $fields) = @_;
    my $pipermail_fields = { };

    my $pos = index($$contref, '<!--beginarticle-->');
    if ($pos > 0) {
	load_pipermail_fields($pipermail_fields, $weighted_str,
			      substr($$contref, 0, $pos));
    }

    # Strip off front-matter
    if ( $pipermail_fields->{'title'} ) {
	my $title = $pipermail_fields->{'title'};
	$$contref =~ s/(<TITLE>).*(<\/TITLE>)/$1$title$2/si;
    }
    $$contref =~ s/(<\/TITLE>).*(<\/HEAD>)/$1$2/si;
    $$contref =~ s/<H1>.*<!--beginarticle-->//si;

    # Strip off end-matter
    $$contref =~ s/<!--endarticle-->.*//s;

    # Move pipermail fields to the filter fields
    my($fld_name, $fld_value);
    while (($fld_name, $fld_value) = each %$pipermail_fields) {
	$fields->{$fld_name} = $fld_value;
    }

    # Return extract MHonArc fields
    #$pipermail_fields;
}

sub load_pipermail_fields {
    my $fields	     	= shift;
    my $weighted_str 	= shift;
    my $pipermail_head	= shift;

    if ($pipermail_head =~ 
		m{<h1>(.*?)</h1>\s*<b>(.*?)\s*</b>\s*<a
href=.*?>(.*?)\s*</a>\s*<br>\s*<i>(.*?)</i>}is) {
        if ($1) {
	    my $subject = uncommentize($1);
	    1  while ($subject =~
s/\A\s*(re|sv|fwd|fw)[\[\]\d]*[:>-]+\s*//i);
	    $subject =~ s/\A\s*\[[^\]]+\]\s*//;
	    $fields->{'title'} = $subject;
        }
        if ($2) {
	    my $from;
	    if ($3) {
                $from = uncommentize("$2 ($3)");
	    } else {
                $from = uncommentize($2);
	    }
            $fields->{'from'} = $from;
        }
        if ($4) {
	    $fields->{'date'} = uncommentize($4);
	}
    }
}

sub uncommentize {
    my($txt) = $_[0];
    $txt =~ s/&#(\d+);/pack("C",$1)/ge;
    $txt;
}
1;





More information about the Mailman-Users mailing list