Changeset 6192


Ignore:
Timestamp:
01/30/10 10:24:04 (3 years ago)
Author:
MichaelTempest
Message:

Item8384: Give the option of extracting the HTML portion from emails.
The default behaviour is supposed to be unchanged (i.e. extract the text portion).

Location:
trunk/MailInContrib
Files:
1 added
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/MailInContrib/data/System/MailInContrib.txt

    r6085 r6192  
    6262!MailInContribUserTemplate and edit to taste. 
    6363 
     64---+ Which part of the mail is added 
     65 
     66By default, the module extracts the plain-text portion of the mail; the HTML portion (if present) is discarded.  
     67When the module extracts the plain-text portion, it also discards inline images associated with the HTML (because, whilst they are part of the mail, they are not _attachments_). 
     68 
     69%X% *Experimental:* 
     70You can configure the module to extract the HTML portion instead, in which case the plain-text portion is discarded. The module then also extracts and attaches inline images. 
     71 
    6472---+ How the contributor is identified 
    6573The user identity is used for access control checks on the target topic, so you can use Foswiki access controls to protect target topics. 
     
    104112|  Release: | %$RELEASE% | 
    105113| Change History: | | 
     114| _not yet released_ | Foswikitask:Item8384: Support HTML email | 
    106115|  3 Feb 2009 | Foswikitask:Item456: ported Foswikitask:Item4955: config options moved to 'Mail and Proxies' Foswikitask:Item5829: force new revision on each submission Foswikitask:Item5843: extract target topic from CC: field - general improvement to rules for extracting the target topic | 
    107116|  10 Mar 2005 | 1.000 Initial version | 
  • trunk/MailInContrib/lib/Foswiki/Contrib/MailInContrib.pm

    r6085 r6192  
    147147    $box->{onSuccess}  ||= 'log'; 
    148148 
     149        # Copy the valid domain pattern for external resource URLs (img, script, style) 
     150        $this->{validUrlPattern} = $box->{validUrlPattern}; 
     151 
    149152    # Load the mail templates 
    150153    Foswiki::Func::loadTemplate('MailInContrib'); 
     
    296299                my $body        = ''; 
    297300 
    298                 _extract( $mail, \$body, \@attachments ); 
     301                                unless ($this->{validUrlPattern}) { 
     302                                        # Default to files attached to this wiki and files in the mail message 
     303                                        my $puburl = Foswiki::Func::expandCommonVariables( 
     304                                                '%PUBURL%', 
     305                                                $topic, $web 
     306                                        ); 
     307                                        $this->{validUrlPattern} = qr/cid:|\Q$puburl/; 
     308                                } 
     309 
     310                $this->_extract( $mail, \$body, \@attachments, $box ); 
    299311 
    300312                print "Received mail from $sender for $web.$topic\n"; 
     
    337349            matching => sub { 
    338350                my $test = shift; 
    339                 if ( defined $kill{ $test->header('Message-ID') } ) { 
    340                     print STDERR "Delete ", $test->header('Message-ID'), "\n" 
     351                                my $message_id = $test->header('Message-ID'); 
     352                if ( defined $message_id and defined $kill{ $message_id } ) { 
     353                    print STDERR "Delete $message_id\n" 
    341354                      if $this->{debug}; 
    342355                    return 1; 
     
    368381} 
    369382 
     383sub _extract { 
     384    my ( $this, $mime, $text, $attach, $box ) = @_; 
     385        $box->{content}{type} ||= ''; 
     386    if ($box->{content}{type} =~ /html/) { 
     387        $this->_extractHtmlAndAttachments($mime, $text, $attach, $box->{content}); 
     388    } 
     389    else { 
     390        _extractPlainTextAndAttachments($mime, $text, $attach); 
     391    } 
     392} 
     393 
     394sub _extractHtmlAndAttachments { 
     395    my ( $this, $mime, $text, $attach, $options ) = @_; 
     396        my $ct = $mime->content_type || 'text/plain'; 
     397    my $dp = $mime->header('Content-Disposition') || 'inline'; 
     398        print STDERR "\nContent-type: $ct\n" if $this->{debug}; 
     399        if ($ct =~ m[multipart/mixed]) { 
     400                $this->_extractMultipartMixed($mime, $text, $attach, $options); 
     401        } 
     402        elsif ($ct =~ m[multipart/alternative]) { 
     403                $this->_extractMultipartAlternative($mime, $text, $attach, $options); 
     404        } 
     405        elsif ( $ct =~ m[multipart/related] ) { 
     406                my $found; 
     407                $found = _extractMultipartHtml($mime, $text, $attach, $options); 
     408                print STDERR "Found multipart/related HTML\n" if $found and $this->{debug}; 
     409                if (not $found) 
     410                { 
     411                        print STDERR "Cannot find HTML. Extracting plain text\n" if $this->{debug}; 
     412                        _extractPlainTextAndAttachments($mime, $text, $attach); 
     413                } 
     414        } 
     415        elsif ( $ct =~ m[text/html] and $dp =~ /inline/ ) { 
     416                print STDERR "Extracting text/html\n" if $this->{debug}; 
     417                _extractPlainHtml($mime, $text, $options); 
     418        } 
     419        else { 
     420                print STDERR "Extracting plain text and attachments\n" if $this->{debug}; 
     421                _extractPlainTextAndAttachments($mime, $text, $attach); 
     422        } 
     423} 
     424 
     425sub _extractMultipartMixed { 
     426    my ( $this, $mime, $text, $attach, $options ) = @_; 
     427    foreach my $part ( grep { $_ != $mime } $mime->parts() ) { 
     428                print STDERR "Multipart/mixed: Recursing\n" if $this->{debug}; 
     429                $this->_extractHtmlAndAttachments($part, $text, $attach, $options); 
     430        } 
     431} 
     432 
     433sub _extractMultipartAlternative { 
     434    my ( $this, $mime, $text, $attach, $options ) = @_; 
     435 
     436        print STDERR "Multipart/alternative\n" if $this->{debug}; 
     437        # See what alternatives are available 
     438        my @alternates = map +{  
     439                mime => $_,  
     440                ct => $_->content_type || 'text/plain',  
     441          }, grep { $_ != $mime } $mime->parts(); 
     442 
     443        my ($multipartRelatedAlternate) = grep { $_->{ct} =~ m[multipart/related] } @alternates; 
     444        my ($htmlAlternate) = grep { $_->{ct} =~ m[text/html] } @alternates; 
     445 
     446        # Pick one 
     447        my $found; 
     448        if ($multipartRelatedAlternate and $options->{type} !~ /plain/) { 
     449                $found = $this->_extractMultipartHtml($multipartRelatedAlternate->{mime}, $text, $attach, $options); 
     450                print STDERR "Found multipart/related HTML\n" if $found and $this->{debug}; 
     451        } 
     452        if ($htmlAlternate and not $found) { 
     453                $found = $this->_extractPlainHtml($htmlAlternate->{mime}, $text, $options); 
     454                print STDERR "Found text/html\n" if $found and $this->{debug}; 
     455        } 
     456        if (not $found) 
     457        { 
     458                print STDERR "Cannot find HTML - Extracting plain text\n" if $this->{debug}; 
     459                _extractPlainTextAndAttachments($mime, $text, $attach); 
     460        } 
     461} 
     462 
     463sub _extractMultipartHtml { 
     464    my ( $this, $mime, $text, $attach, $options ) = @_; 
     465        my @bits = map +{  
     466                mime => $_,  
     467                ct => $_->content_type || 'text/plain',  
     468        dp => $_->header('Content-Disposition') || 'inline' 
     469          }, grep { $_ != $mime } $mime->parts(); 
     470        my ($htmlBit) = grep { $_->{ct} =~ m[text/html] and $_->{dp} =~ /inline/ } @bits; 
     471        return unless $htmlBit; # Not found 
     472 
     473        my $html = $this->_extractAndTrimHtml($htmlBit->{mime}); 
     474        return unless $html; 
     475    for my $bit (grep { $_ != $htmlBit } @bits) 
     476        { 
     477                my $filename = $bit->{mime}->filename(); 
     478                ($filename) = Foswiki::Sandbox::sanitizeAttachmentName( $bit->{mime}->filename() ) if defined $filename; 
     479                my $cid = $bit->{mime}->header('Content-ID') || ''; 
     480                my $cid_used = ''; 
     481                print STDERR "cid:[$cid]\n" if $cid and $this->{debug}; 
     482                if ($cid =~ /^\s*<?((.*?)\@.*?)>?\s*$/) { 
     483                        $cid = $1; 
     484                        ($filename) = Foswiki::Sandbox::sanitizeAttachmentName($2); 
     485                        $cid_used = ($html =~ s{"cid:\Q$cid\E"}{"%ATTACHURLPATH%/$filename"}); 
     486                } 
     487                if ( $filename and ($bit->{dp} !~ /inline/ or ($cid and $cid_used) ) ) { 
     488                        push( 
     489                                @$attach, 
     490                                { 
     491                                        payload  => $bit->{mime}->body(), 
     492                                        filename => $filename 
     493                                } 
     494                        ); 
     495                } 
     496        } 
     497        $$text .= "<literal><div class=\"foswikiMailInContribHtml\">$html</div></literal>\n"; 
     498        return 1; 
     499} 
     500 
     501sub _extractPlainHtml { 
     502    my ( $this, $mime, $text, $options ) = @_; 
     503        my $html = $this->_extractAndTrimHtml($mime); 
     504        return unless $html; 
     505        $$text .= "<literal><div class=\"foswikiMailInContribPlainHtml\">$html</div></literal>\n"; 
     506        return 1; 
     507} 
     508 
     509sub _extractAndTrimHtml { 
     510    my ($this, $mime) = @_; 
     511        return unless $mime; 
     512        my $html = $mime->body(); 
     513        return unless $html; 
     514 
     515        # Remove anything outside the body tag, and change the body tag into a div tag 
     516        # It is better to keep the body tag as a tag (and not just discard it altogether) 
     517        # because that tag sometimes has attributes that should be retained. 
     518        $html =~ s{.*<body([^>]*>.*)</body>.*}{<div$1</div>}is; 
     519 
     520        # Remove tags that point to external sites 
     521        my $validUrlPattern = $this->{validUrlPattern}; 
     522        $html =~ s{<(script|style|img)          # opening tag 
     523                       [^>]+                        # whitespace or attributes 
     524                           \bsrc=                       # attribute that contains a URL that could be used as e.g. a webbug 
     525                           (['"])                       # opening quote 
     526                           (?!$validUrlPattern)         # Zero-width negative lookahead for valid URLs 
     527                                                        # URLs that don't match this pattern might be evil 
     528                           [^>]+?                       # the URL itself 
     529                           \2                           # closing quote that matches the opening quote 
     530                           [^>]*                        # Any other attributes or whitespace 
     531                           (?: 
     532                             >.*?</\1>                  # End of tag, content, and closing tag 
     533                           |                            #   or 
     534                                 />                         # End of tag, and tag does not have content 
     535                           ) 
     536                          }{<em>External link removed</em>}isgx if $validUrlPattern; 
     537 
     538        return unless $html =~ /\S/; 
     539    return $html; 
     540} 
     541 
     542 
    370543# Extract plain text and attachments from the MIME 
    371 sub _extract { 
     544sub _extractPlainTextAndAttachments { 
    372545    my ( $mime, $text, $attach ) = @_; 
    373546 
     
    388561        } 
    389562        elsif ( $part != $mime ) { 
    390             _extract( $part, $text, $attach ); 
     563            _extractPlainTextAndAttachments( $part, $text, $attach ); 
    391564        } 
    392565    } 
  • trunk/MailInContrib/lib/Foswiki/Contrib/MailInContrib/Config.spec

    r6116 r6192  
    5656#    <li> error - treat this as an error (overrides all other options)</li> 
    5757#    <li> spam - save the mail in the spambox topic. 
    58 #    Note: if you clear this, then Foswiki will simply ignore the mail..</li> 
     58#    Note: if you clear this, then Foswiki will simply ignore the mail.</li> 
    5959#   </ul> 
    6060#  </li> 
    61 #  <li> spambox - optional required of onNoTopic = spam. Name of the topic 
     61#  <li> spambox - optional, required if onNoTopic = spam. Name of the topic 
    6262#   where you want to save mails that don't have a valid web.topic. You must 
    6363#   specify a full web.topicname 
     64#  </li> 
     65#  <li> content - optional, defaults to "extract plain text".  
     66#   Specifies what part of the mail to extract and how to process it. 
     67#   It takes a number of fields: 
     68#   <ul> 
     69#    <li> type - specifies type of content to extract.  
     70#       Available options: 
     71#     <ul> 
     72#      <li> text - extract the plain-text portion </li> 
     73#      <li> html - extract the HTML portion, by preference  
     74#        - reverts to the plain-text if the mail does not contain HTML  
     75#      </li> 
     76#     </ul> 
     77#    </li> 
     78#   </ul> 
    6479#  </li> 
    6580# </ul> 
  • trunk/MailInContrib/test/unit/MailInContrib/MailInContribSuite.pm

    r6149 r6192  
    88sub name { 'MailInContribSuite' } 
    99 
    10 sub include_tests { qw(MailInContribTests) } 
     10sub include_tests { qw(MailInContribTests MailInContribMimeTests) } 
    1111 
    12121; 
Note: See TracChangeset for help on using the changeset viewer.