#!/usr/bin/perl

# newsgroups-m.pl - (configured to connect to U. Mannheim news server)

# newsgroups.pl
# Constructs Web contents pages that show the Usenet groups available on
# a news server.
# Writes static HTML pages with news:// links to the newsgroups on the
# server.
# For each top level group, lists the newsgroups in it on one or more
# HTML pages, with some maximum number of groups listed per page.
# If a top level group has too many groups to list on one page, builds
# a sub-contents page that shows the first and last group on each
# listing page for the top level group.

# Rod Clark

# v1.85
# Feb 6, 2001
#
# now uses Socket module for better socket compatibility

# v1.84
# Jan 7, 2001
#
# Junk groups now match beginning of string (e.g. control*...), not exact
# string.
# Removed $IndexPageName as unnecessary.

# v1.83
# November 21, 2000
#
# Split MainContentsPage into MainContentsPagePath and MainContentsPageURL
# to make it easier to use separate directories for each news server.

# v1.82
# April 3, 2000
#
# Revised how $FileAbbrev is used, and comments for it.
# Added $IndexPageName.

# v1.81
# March 21, 1998
#
# Changed to news:// URLs from nntp:// for better compatibility with various
# browsers.

# v1.8
# March 21, 1998
#
# Constructs Web contents pages that show the Usenet groups available on
# a news server.
# Writes static HTML pages with links to nntp:// news URLs for browsers.
# For each top level group, lists the groups and the number of messages
# in each group on one or more HTML pages, with some maximum number
# of groups listed per page.
# If a top level group has too many groups to list on one page, builds
# a sub-contents page that shows the first and last group on each
# listing page for the top level group.

# v1.6
# September 19, 1998
#
# This version prints nntp:// URLs (allows posting to servers that allow
# posting messages) instead of using readnews and readmsg to display
# messages. (This doesn't have the search capability that readnews
# includes.)

# v1.5
# July 30, 1998
#
# More configuration items.

# v1.4
# July 29, 1998
#
# Can write a local static active file.
# Reading the group sizes individually is now optional.
# Added some configuration items.

# v1.3
# July 27, 1998
#
# If the active file isn't available from the file system, connects to
# the news server via NNTP and retrieves it.

# v1.2
# July 24, 1998
#
# Connects to the news server via NNTP and issues a GROUP command for
# each group, to find accurate group sizes. (The active file is
# unreliable for this, as it gives only the first and last article
# numbers, and sometimes the first message is a leftover with an
# old article number that leads to a wildly inaccurate group size
# estimate.)

# v1.1
# July 20, 1998

use strict;
use warnings;
use Socket;

#----------------------------------------------------------------
# Configuration
#----------------------------------------------------------------

my $ProviderName = "U. Mannheim";
my $newshost = "news.uni-mannheim.de";
my $WebPagesHostURL = "http://www.scn.org"; # no trailing slash

# This is the directory where the script writes the Web menu pages.
# After running the script, you can manually symlink the directory's
# home page to the script-generated groups*.html page, if you wish.
# Or you can maintain a separate home page for the directory, that
# isn't automatically generated by the script.

my $WebPagesDirPath = "/web/usenet/mannheim/"; # initial/trailing slashes
my $WebPagesDirURL = "/usenet/mannheim/"; # initial/trailing slashes
my $PageExt = "html";

# add an abbreviation for remote servers - none for local server
my $FileAbbrev = "m";

# The above abbreviation is added to the filenames, to keep them
# separate from the files that read from other news servers. For
# example: the abbreviation "k" gives files named groups-k.html,
# k-alt.html, etc. Unless you have only one set of files for one
# server, you must specify a non-blank value for this.

# -----

# If the news server's files are on the same Unix file system that this
# script runs on, set this to 1. If not, set this to 0:

my $NewsServerIsOnThisFileSystem = 0;

# If the above is set to 1, then you must set the path to the news
# active file:

my $NewsActiveFile = "/usr/lib/news/active";

# Set the next value to 1 to read the group list from the news server via
# NNTP. It should be 0 if you are reading the active file from a local
# news server. This must be set to 1 on the first run, if you aren't
# connecting to a local news server for which you can read the active file.
# Leave it set to 1 to refresh the contents pages from updated information
# from the remote server.
#
# Set it to 0 to re-index the HTML pages without updating the active file,
# when making local revisions such as changes in the maximum number of
# groups listed per page or the news server's name:

my $ReadListFromRemoteServer = 0;

# The maximum number of groups on a page might be
# 50 for < 5,000 groups
# 100 for < 10,000 groups
# 150 for < 15,000 groups
# 200 for < 20,000 groups
# 250 for > 20,000 groups

my $MaxGroupsPerPage = 100;

# avoid overrunning one line on contents subpages in Lynx
# (this value is not referenced by any code in this file)
my $MaxContentsLineLength = 58;

# Getting the group sizes individually for each group takes a very long
# time (possibly 3 to 8 groups per second, or slower). Always set this
# to 0, unless you have your own news server and it has only a small
# number of groups on it.

my $OKToGetGroupSizes = 0;

#----------------------------------------------------------------
# You shouldn't need to change anything below this line.
#----------------------------------------------------------------

# Generated file names start with the abbreviation, or with "news" when
# no abbreviation is configured.
my $FilePrefix = $FileAbbrev ? $FileAbbrev : "news";

my $GroupListPageBegin = "$FilePrefix-";
my $StaticActiveFile = "$FilePrefix-active.txt";
my $GroupNamesTextFile = "$FilePrefix-groups.txt";
my $TopLevelTextFile = "$FilePrefix-toplevel.txt";
my $CheckedGroupsTextFile = "$FilePrefix-checked.txt";

# The main groups list page - a plain filename, not a complete path.
# (As in earlier versions, a blank $FileAbbrev gives "groups-.html".)
my $MainContentsPagePath = "groups-$FileAbbrev.html";

# $MainContentsPageURL can be "" if you symlink index.html to the
# above $MainContentsPagePath file.
# Otherwise, it should be the same as that file.
my $MainContentsPageURL = "groups-$FileAbbrev.html";

# NNTP port: a number, or a service name to look up with getservbyname.
my $port = 119;

# Ignore these active file groups when writing group names:
my @JunkNames = ('control', 'cancel', 'junk');

# How many times a GROUP command is retried after a 400 reply.
my $MaxGroupRetries = 3;

#----------------------------------------------------------------
# End of configuration
#----------------------------------------------------------------

# State shared between the routines below.
my @ActiveLines;              # raw active-file lines, as read
my @GroupLines;               # "groupname count" for each listed group, sorted
my @CheckedGroupLines;        # "groupname count" as confirmed by GROUP
my @TopLevelNames;            # distinct top level names, in @GroupLines order
my $StaticActiveFileUsed = 0; # true when the list came from the static copy
my $NewsSocket;               # open NNTP connection, or undef
my $PageFile;                 # handle of the HTML page being written

# State for the top level group currently being written.
my $CurrentTopLevelName;
my @CurrentGroupLines;
my $BatchNumber;
my $BatchContentsLine;
my @BatchGroupNames;
my @BatchContentsLines;

# A write to a connection the server has dropped should fail the print,
# not kill the script with SIGPIPE.
$SIG{PIPE} = 'IGNORE';

my $Date = `date`;
if (!defined ($Date) || $Date !~ /\S/)
   {
   $Date = scalar localtime;
   }
chomp ($Date);

if ($MaxGroupsPerPage < 1)
   {
   die "\$MaxGroupsPerPage must be at least 1\n";
   }

print "start\n";

GetGroupList ();
print "done\n";

# Stop before touching any output file if nothing usable was read.
if (!@GroupLines)
   {
   die "No newsgroups were read for $ProviderName - no files written.\n";
   }

if (!$StaticActiveFileUsed)
   {
   print "Writing local static active file...";
   PrintStaticActiveFile ();
   print "done\n";
   }

if ($OKToGetGroupSizes)
   {
   print "Getting group sizes...";
   GetGroupSizes ();
   print "done\n";
   }

# Done after the optional size check, which can remove groups.
FindTopLevelGroupNames ();

print "Deleting previous list pages...";
DeletePreviousGroupListPages ();
print "done\n";

print "Writing list pages...";
PrintGroupListPages ();
print "done\n";

print "Writing contents page...";
PrintContentsPage ();
print "done\n";

# Writing these text files is optional. The program doesn't use them.

# print "Writing other text files...";
# PrintGroupNamesTextFile ();
# PrintTopLevelTextFile ();
# if ($OKToGetGroupSizes)
#    {
#    PrintCheckedGroupsTextFile ();
#    }
# print "done\n";

#----------------------------------------------------------------
# read data
#----------------------------------------------------------------

# Fills @ActiveLines and @GroupLines from the source selected by
# $NewsServerIsOnThisFileSystem and $ReadListFromRemoteServer: the news
# server's own active file, an NNTP LIST, or the static copy written by
# an earlier run. Exits if the static copy is needed but missing.
sub GetGroupList
{
if ($NewsServerIsOnThisFileSystem)
   {
   # When refreshing, fall back to NNTP if the active file is unusable.
   if (!$ReadListFromRemoteServer || IsReadableNonEmpty ($NewsActiveFile))
      {
      print "Getting group list from news active file...";
      GetGroupListFromActiveFile ($NewsActiveFile);
      }
   else
      {
      print "Getting group list via NNTP...";
      GetGroupListFromNNTP ();
      }
   return;
   }

if ($ReadListFromRemoteServer)
   {
   print "Getting group list via NNTP...";
   GetGroupListFromNNTP ();
   return;
   }

# The static copy is read from the directory PrintStaticActiveFile
# writes it to.
my $StaticActivePath = "$WebPagesDirPath$StaticActiveFile";
if (IsReadableNonEmpty ($StaticActivePath))
   {
   print "Getting group list from static active file...";
   $StaticActiveFileUsed = 1;
   GetGroupListFromActiveFile ($StaticActivePath);
   return;
   }

print "\n\nNOTE: No local static active file exists for $ProviderName.\n";
print "--> Set the variable \$ReadListFromRemoteServer = 1;\n\n";
exit 1;
}

# True if $Path exists, is readable and is not empty.
sub IsReadableNonEmpty
{
my ($Path) = @_;
return ((-e $Path) && (-r $Path) && !(-z $Path));
}

# Turns one active-file line ("group high low flags") into a
# "group count" entry. Returns an empty list for lines that are not
# listed: names with no dot, junk groups, and names containing anything
# other than letters, digits, "+", "-", "_" and "." (the name is used
# in file names and HTML, and with NNTP it comes from the remote server).
sub ParseActiveLine
{
my ($ActiveLine) = @_;

my ($GroupName, $EndMessageNumber, $StartMessageNumber) =
   split (' ', $ActiveLine, 4);

return unless defined ($GroupName);
return unless $GroupName =~ /\A[A-Za-z0-9_+\-]+(?:\.[A-Za-z0-9_+\-]+)+\z/;

foreach my $JunkName (@JunkNames)
   {
   return if $GroupName =~ /\A\Q$JunkName\E/;
   }

# Only an estimate: the article numbers in the active file can be stale.
my $NumberOfMessages = 0;
if (defined ($StartMessageNumber)
      && $EndMessageNumber =~ /\A\d+\z/
      && $StartMessageNumber =~ /\A\d+\z/)
   {
   $NumberOfMessages = ($EndMessageNumber - $StartMessageNumber) + 1;
   if ($NumberOfMessages < 0)
      {
      $NumberOfMessages = 0;
      }
   }

return "$GroupName $NumberOfMessages";
}

# Returns the part of a group name (or group line) before the first dot.
sub TopLevelNameOf
{
my ($GroupLine) = @_;
my ($TopLevelName) = split (/\./, $GroupLine, 2);
return $TopLevelName;
}

# Reads the active file $ThisListFile into @ActiveLines (sorted) and
# builds @GroupLines from it. Dies if the file can't be opened.
sub GetGroupListFromActiveFile
{
my ($ThisListFile) = @_;

open (my $Active, '<', $ThisListFile)
   or die "Can't read news active file: $ThisListFile: $!\n";
@ActiveLines = sort (<$Active>);
close ($Active);

@GroupLines = ();
foreach my $ActiveLine (@ActiveLines)
   {
   my $GroupLine = ParseActiveLine ($ActiveLine);
   if (defined ($GroupLine))
      {
      push (@GroupLines, $GroupLine);
      }
   }
@GroupLines = sort (@GroupLines);
}

# Retrieves the active list with an NNTP LIST command, filling
# @ActiveLines (carriage returns removed) and @GroupLines. Dies if the
# server rejects LIST, so that an existing static active file is not
# overwritten with nothing.
sub GetGroupListFromNNTP
{
ConnectToNewsServer ();

my $ListStatus = CheckListStatus ();
if ($ListStatus != 215)
   {
   QuitNewsServer ();
   die "Error: List command rejected ($ListStatus)\n";
   }

my $SawEndOfList = 0;
while (defined (my $Line = <$NewsSocket>))
   {
   $Line =~ s/\r//g;
   if ($Line eq ".\n")
      {
      $SawEndOfList = 1;
      last;
      }
   push (@ActiveLines, $Line);

   my $GroupLine = ParseActiveLine ($Line);
   if (defined ($GroupLine))
      {
      push (@GroupLines, $GroupLine);
      }
   }

# As before, a truncated list is reported but still used.
if (!$SawEndOfList)
   {
   print "Unexpected EOF on socket\n";
   }
@GroupLines = sort (@GroupLines);

QuitNewsServer ();
}

# Rebuilds @TopLevelNames: each distinct top level name, in the order
# it first appears in the sorted @GroupLines.
sub FindTopLevelGroupNames
{
@TopLevelNames = ();
foreach my $GroupLine (@GroupLines)
   {
   my $TopLevelName = TopLevelNameOf ($GroupLine);
   if (!@TopLevelNames || $TopLevelNames[-1] ne $TopLevelName)
      {
      push (@TopLevelNames, $TopLevelName);
      }
   }
}

# Asks the server for the size of every group in @GroupLines with GROUP
# and replaces @GroupLines with the result:
#   211        - the count reported by the server is used
#   411        - the server has no such group, so it is dropped
#   400        - retried up to $MaxGroupRetries times, two seconds apart
#   no reply   - connection lost; the remaining groups keep their estimates
#   other      - the group keeps its active-file estimate
sub GetGroupSizes
{
ConnectToNewsServer ();

@CheckedGroupLines = ();
my $NumberOfGroupLines = @GroupLines;
my $GroupIndex = 0;
my $Retries = 0;

while ($GroupIndex < $NumberOfGroupLines)
   {
   my ($GroupName) = split (/ /, $GroupLines[$GroupIndex], 2);
   my ($GroupStatus, $GroupCount) = CheckGroupSize ($GroupName);

   if ($GroupStatus == 400 && $Retries < $MaxGroupRetries)
      {
      print "-\n";
      sleep (2);
      $Retries++;
      next;
      }

   if ($GroupStatus == 0 || $GroupStatus == 400)
      {
      push (@CheckedGroupLines,
            @GroupLines[$GroupIndex .. $NumberOfGroupLines - 1]);
      last;
      }

   if ($GroupStatus == 211)
      {
      push (@CheckedGroupLines, "$GroupName $GroupCount");
      }
   elsif ($GroupStatus != 411)
      {
      push (@CheckedGroupLines, $GroupLines[$GroupIndex]);
      }

   $Retries = 0;
   $GroupIndex++;
   print "$GroupIndex\n";
   }

QuitNewsServer ();

@GroupLines = @CheckedGroupLines;
}

#----------------------------------------------------------------
# write text files
#----------------------------------------------------------------

# Opens $Path for writing and returns the handle. $Description names the
# file in the error message if the open fails.
sub OpenOutputFile
{
my ($Path, $Description) = @_;
open (my $Handle, '>', $Path)
   or die "Can't write $Description: $Path: $!\n";
return $Handle;
}

# Closes a handle from OpenOutputFile, checking for write errors that
# only surface at close, and makes the file group-writable.
sub CloseOutputFile
{
my ($Handle, $Path) = @_;
close ($Handle) or die "Error writing $Path: $!\n";
chmod (0664, $Path) or print "Can't set permissions on $Path: $!\n";
}

# Saves @ActiveLines as the static active file in the Web pages directory.
sub PrintStaticActiveFile
{
my $Path = "$WebPagesDirPath$StaticActiveFile";
my $TextFile = OpenOutputFile ($Path, "local active file");
print {$TextFile} @ActiveLines;
CloseOutputFile ($TextFile, $Path);
}

# print alphabetical list of all group names, as a text file
sub PrintGroupNamesTextFile
{
my $Path = "$WebPagesDirPath$GroupNamesTextFile";
my $TextFile = OpenOutputFile ($Path, "active groups text file");
foreach my $GroupLine (@GroupLines)
   {
   my ($GroupName) = split (/ /, $GroupLine, 2);
   print {$TextFile} "$GroupName\n";
   }
CloseOutputFile ($TextFile, $Path);
}

# Writes @TopLevelNames, one per line, as a text file.
sub PrintTopLevelTextFile
{
my $Path = "$WebPagesDirPath$TopLevelTextFile";
my $TopFile = OpenOutputFile ($Path, "top level groups text file");
foreach my $TopLevelName (@TopLevelNames)
   {
   print {$TopFile} "$TopLevelName\n";
   }
CloseOutputFile ($TopFile, $Path);
}

# Writes @CheckedGroupLines (set by GetGroupSizes), one per line.
sub PrintCheckedGroupsTextFile
{
my $Path = "$WebPagesDirPath$CheckedGroupsTextFile";
my $CheckedFile = OpenOutputFile ($Path, "checked groups text file");
foreach my $CheckedGroupLine (@CheckedGroupLines)
   {
   print {$CheckedFile} "$CheckedGroupLine\n";
   }
CloseOutputFile ($CheckedFile, $Path);
}

#----------------------------------------------------------------
# write Web files
#----------------------------------------------------------------

# NOTE(review): the HTML markup of the original pages was not available
# when this file was restored; only the visible text was. The tags in the
# routines below are a reconstruction around that text - compare them
# with the deployed pages before relying on the exact layout.

# Removes the list pages written by an earlier run: every file in the
# Web pages directory that starts with $GroupListPageBegin and ends
# with ".$PageExt".
sub DeletePreviousGroupListPages
{
opendir (my $Usenet, $WebPagesDirPath)
   or die "Can't read Web pages directory: $WebPagesDirPath: $!\n";
my @OldPages = grep { /\A\Q$GroupListPageBegin\E/ && /\.\Q$PageExt\E\z/ }
   readdir ($Usenet);
closedir ($Usenet);

foreach my $Filename (@OldPages)
   {
   # readdir returns bare names, so the directory has to be put back
   unlink ("$WebPagesDirPath$Filename")
      or print "Can't delete $WebPagesDirPath$Filename: $!\n";
   }
}

# Walks the sorted @GroupLines and writes the page(s) for each run of
# groups that share a top level name.
sub PrintGroupListPages
{
return unless @GroupLines;

@CurrentGroupLines = ();
$CurrentTopLevelName = TopLevelNameOf ($GroupLines[0]);

foreach my $GroupLine (@GroupLines)
   {
   my $TopLevelName = TopLevelNameOf ($GroupLine);
   if ($TopLevelName ne $CurrentTopLevelName)
      {
      PrintThisTopLevelGroup ();
      $CurrentTopLevelName = $TopLevelName;
      @CurrentGroupLines = ();
      }
   push (@CurrentGroupLines, $GroupLine);
   }
PrintThisTopLevelGroup ();
}

# Writes @CurrentGroupLines as one page, or as several pages plus a
# sub-contents page when there are more than $MaxGroupsPerPage groups.
sub PrintThisTopLevelGroup
{
if (@CurrentGroupLines > $MaxGroupsPerPage)
   {
   PrintBigTopLevelGroup ();
   }
else
   {
   PrintSmallTopLevelGroup ();
   }
}

# Writes one line of a list page: a news:// link to $GroupName on
# $newshost.
sub PrintGroupLink
{
my ($GroupName) = @_;
print {$PageFile} "<A HREF=\"news://$newshost/$GroupName\">$GroupName</A><BR>\n";
}

# Writes the single list page for the current top level group.
sub PrintSmallTopLevelGroup
{
my $GroupListPagePath =
   "$WebPagesDirPath$GroupListPageBegin$CurrentTopLevelName.$PageExt";

$PageFile = OpenOutputFile ($GroupListPagePath, "group list page");

PrintListPageTop ($CurrentTopLevelName);
foreach my $CurrentGroupLine (@CurrentGroupLines)
   {
   my ($GroupName) = split (/ /, $CurrentGroupLine, 2);
   PrintGroupLink ($GroupName);
   }
PrintPageFooter ();

CloseOutputFile ($PageFile, $GroupListPagePath);
}

# Splits the current top level group into evenly sized batches of at
# most $MaxGroupsPerPage groups, writes a numbered list page for each
# batch, then writes the sub-contents page that links to them.
sub PrintBigTopLevelGroup
{
my $NumberOfCurrentGroups = @CurrentGroupLines;

# Round up twice: first to the number of pages needed, then to the
# number of groups per page that spreads the groups evenly over them.
my $NumberOfBatches =
   int (($NumberOfCurrentGroups + $MaxGroupsPerPage - 1) / $MaxGroupsPerPage);
my $GroupsPerBatch =
   int (($NumberOfCurrentGroups + $NumberOfBatches - 1) / $NumberOfBatches);

@BatchContentsLines = ();
$BatchNumber = 1;

for (my $First = 0; $First < $NumberOfCurrentGroups; $First += $GroupsPerBatch)
   {
   my $Last = $First + $GroupsPerBatch - 1;
   if ($Last > $NumberOfCurrentGroups - 1)
      {
      $Last = $NumberOfCurrentGroups - 1;   # short final page
      }

   @BatchGroupNames =
      map { (split (/ /, $_, 2))[0] } @CurrentGroupLines[$First .. $Last];

   $BatchContentsLine = "$BatchGroupNames[0] -- $BatchGroupNames[-1]";
   push (@BatchContentsLines, $BatchContentsLine);

   PrintBigListPage ();
   $BatchNumber++;
   }

PrintSubContentsPage ();

@BatchGroupNames = ();
@BatchContentsLines = ();
}

# Writes list page number $BatchNumber of the current top level group,
# containing @BatchGroupNames and headed by $BatchContentsLine.
sub PrintBigListPage
{
my $GroupListPagePath =
   "$WebPagesDirPath$GroupListPageBegin$CurrentTopLevelName$BatchNumber.$PageExt";

$PageFile = OpenOutputFile ($GroupListPagePath, "large-group list page");

PrintListPageTop ($BatchContentsLine);
foreach my $ListGroupName (@BatchGroupNames)
   {
   PrintGroupLink ($ListGroupName);
   }
PrintPageFooter ();

CloseOutputFile ($PageFile, $GroupListPagePath);
}

# Print a Web sub-contents page with a link to each grouplist page,
# for a big top-level group.
sub PrintSubContentsPage
{
my $SubContentsPagePath =
   "$WebPagesDirPath$GroupListPageBegin$CurrentTopLevelName.$PageExt";

$PageFile = OpenOutputFile ($SubContentsPagePath, "large-group subcontents page");

PrintListPageTop ($CurrentTopLevelName);

my $BatchPageCounter = 1;
foreach my $ContentsLine (@BatchContentsLines)
   {
   my $GroupListPageURL =
      "$WebPagesHostURL$WebPagesDirURL$GroupListPageBegin$CurrentTopLevelName$BatchPageCounter.$PageExt";
   print {$PageFile} "<A HREF=\"$GroupListPageURL\">$ContentsLine</A><BR>\n";
   $BatchPageCounter++;
   }
PrintPageFooter ();

CloseOutputFile ($PageFile, $SubContentsPagePath);
}

# Print a Web contents page with a link to each grouplist page.
# The grouplist page is a sub-contents page if there are too many
# groups for one grouplist page.
sub PrintContentsPage
{
my $ContentsPagePath = "$WebPagesDirPath$MainContentsPagePath";

$PageFile = OpenOutputFile ($ContentsPagePath, "main contents page");

PrintContentsPageTop ();
foreach my $TopLevelName (@TopLevelNames)
   {
   my $GroupListPageURL =
      "$WebPagesHostURL$WebPagesDirURL$GroupListPageBegin$TopLevelName.$PageExt";
   print {$PageFile} "<A HREF=\"$GroupListPageURL\">$TopLevelName</A><BR>\n";
   }
PrintPageFooter ();

CloseOutputFile ($PageFile, $ContentsPagePath);
}

# Writes the head of the main contents page to $PageFile.
sub PrintContentsPageTop
{
print {$PageFile} <<ENDPRINT;
<HTML>
<HEAD>
<TITLE>SCN - Usenet: Newsgroups from $ProviderName</TITLE>
</HEAD>
<BODY>
<B>Usenet</B>
<P>
<H2>Newsgroups on $ProviderName Server</H2>
<P>
<B>Menus</B>
<P>
ENDPRINT
}

# Writes the head of a list or sub-contents page to $PageFile, with
# $PrintPageName as its title and heading.
# NOTE(review): pointing "Newsgroups from ..." at $MainContentsPageURL
# is an assumption based on that setting's description - confirm.
sub PrintListPageTop
{
my ($PrintPageName) = @_;
print {$PageFile} <<ENDPRINT;
<HTML>
<HEAD>
<TITLE>SCN - Usenet: $PrintPageName</TITLE>
</HEAD>
<BODY>
<B>Usenet</B>
<BR>
<A HREF="$WebPagesHostURL$WebPagesDirURL$MainContentsPageURL">Newsgroups from $ProviderName</A>
<P>
<H2>$PrintPageName</H2>
<P>
ENDPRINT
}

# Writes the closing part of a page to $PageFile.
sub PrintPageFooter
{
print {$PageFile} <<ENDPRINT;
<P>
<HR>
Updated $Date - webmaster\@scn.org
</BODY>
</HTML>
ENDPRINT
}

#----------------------------------------------------------------
# NNTP routines
#----------------------------------------------------------------

# Opens a TCP connection to $newshost on $port, leaving it in
# $NewsSocket, and reads the greeting. Dies unless the server answers
# 200 (posting allowed) or 201 (no posting allowed).
sub ConnectToNewsServer
{
my $Proto = (getprotobyname ('tcp'))[2];

my $PortNumber = $port;
if ($PortNumber !~ /^\d+$/)
   {
   $PortNumber = (getservbyname ($port, 'tcp'))[2];
   if (!defined ($PortNumber))
      {
      die "Error: unknown NNTP service name: $port\n";
      }
   }

# inet_aton accepts a dotted address or a host name to look up
my $Address = inet_aton ($newshost);
if (!defined ($Address))
   {
   die "Error: NNTP host not found: $newshost\n";
   }

if (!socket ($NewsSocket, AF_INET, SOCK_STREAM, $Proto))
   {
   die "Error: socket failed: $!\n";
   }

# No explicit bind: connect picks the local address and port.
# sockaddr_in builds the address structure in this platform's layout.
if (!connect ($NewsSocket, sockaddr_in ($PortNumber, $Address)))
   {
   die "Can't connect to $newshost tcp/ip port $PortNumber. Error: $!\n";
   }

# send each command as soon as it is printed
my $PreviousHandle = select ($NewsSocket);
$| = 1;
select ($PreviousHandle);

# read server status
my ($Status, $Rest) = ReadStatusLine ();
if ($Status != 200 && $Status != 201)
   {
   die "$newshost refused connection: status $Status - $Rest\n";
   }
}

# Reads one reply line from the server. Returns (status, rest of line);
# the status is 0 if there is no connection, nothing could be read, or
# the line does not start with a number.
sub ReadStatusLine
{
return (0, '') unless defined ($NewsSocket);

my $Reply = <$NewsSocket>;
return (0, '') unless defined ($Reply);

$Reply =~ s/\r?\n\z//;
my ($Status, $Rest) = split (/ /, $Reply, 2);
if (!defined ($Status) || $Status !~ /\A\d+\z/)
   {
   $Status = 0;
   }
if (!defined ($Rest))
   {
   $Rest = '';
   }
return ($Status, $Rest);
}

# Sends LIST and returns the status code of the reply (215 when the
# list follows).
sub CheckListStatus
{
print {$NewsSocket} "LIST\r\n";

my ($Status) = ReadStatusLine ();
return ($Status);
}

# Sends GROUP for $CheckGroupSizeGroup. Returns the list
# (status, article count); the count is 0 unless the status is 211.
# Call it in list context.
sub CheckGroupSize
{
my ($CheckGroupSizeGroup) = @_;

print {$NewsSocket} "GROUP $CheckGroupSizeGroup\r\n";

# a 211 reply reads: 211 count first last group
my ($Status, $Rest) = ReadStatusLine ();
my ($Count) = split (/ /, $Rest, 2);
if ($Status != 211 || !defined ($Count) || $Count !~ /\A\d+\z/)
   {
   $Count = 0;
   }
return ($Status, $Count);
}

# Sends QUIT, reports any reply other than 205, and closes the
# connection. Returns the status code of the reply, or 0 if there was
# no connection to close.
sub QuitNewsServer
{
return (0) unless defined ($NewsSocket);

print {$NewsSocket} "QUIT\r\n";

my ($Status, $Rest) = ReadStatusLine ();
if ($Status != 205)
   {
   print "news server status=$Status: $Rest\n";
   }

close ($NewsSocket);
$NewsSocket = undef;

return ($Status);
}