system/archmage: Added (CHM to HTML/PDF/text converter)

Signed-off-by: Willy Sudiarto Raharjo <willysr@slackbuilds.org>
This commit is contained in:
B. Watson 2022-01-10 23:19:23 -05:00 committed by Willy Sudiarto Raharjo
parent 725c8bbf4a
commit c218f52197
No known key found for this signature in database
GPG key ID: 3F617144D7238786
7 changed files with 566 additions and 0 deletions

4
system/archmage/README Normal file
View file

@ -0,0 +1,4 @@
archmage (CHM to HTML/PDF/text converter)
arCHMage converts CHM files to HTML, plain text, or PDF. CHM is the
format used by Microsoft HTML Help, also known as Compiled HTML.

View file

@ -0,0 +1,289 @@
diff --git a/.gitignore b/.gitignore
index 3768c97..d88251b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
build/
dist/
*.pyc
+.eggs
diff --git a/.travis.yml b/.travis.yml
index b94c4dc..8770821 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,7 @@
os: linux
+arch:
+ - amd64
+ - ppc64le
addons:
apt:
update: true
@@ -6,8 +9,8 @@ addons:
- libchm-dev
language: python
python:
- - "3.5"
- "3.6"
- "3.7"
- "3.8"
+ - "3.9"
script: scripts/travis-run.sh
diff --git a/AUTHORS b/AUTHORS
index 23e621b..d36c3ec 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,3 +1,3 @@
Copyright (c) 2003 Eugeny Korekin <az@ftc.ru>
Copyright (c) 2005-2009 Basil Shubin <basil.shubin@gmail.com>
-Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net>
+Copyright (c) 2015-2020 Misha Gusarov <dottedmag@dottedmag.net>
diff --git a/README.md b/README.md
index ec2b4e1..b14cec2 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,8 @@ This feature requires `htmldoc(1)`, and `lynx(1)` or `elinks(1)` installed.
Installation
============
+Archmage uses PyCHM that depends on (C library) CHMlib. After CHMlib is installed, do
+
pip install archmage
Requirements
@@ -40,7 +42,7 @@ Requirements
arCHMage has the following dependencies:
- * Python 3.5+
+ * Python 3.6+
* PyCHM
* BeautifulSoup4
diff --git a/archmage/CHM.py b/archmage/CHM.py
index ce85446..44bbd98 100644
--- a/archmage/CHM.py
+++ b/archmage/CHM.py
@@ -3,7 +3,7 @@
# archmage -- CHM decompressor
# Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net>
# Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
-# Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net>
+# Copyright (c) 2015-2020 Misha Gusarov <dottedmag@dottedmag.net>
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -29,6 +29,7 @@ import string
import tempfile
import os.path
from enum import Enum
+from typing import List, Union
import archmage
@@ -36,7 +37,7 @@ from archmage.CHMParser import SitemapFile, PageLister, ImageCatcher, TOCCounter
# import PyCHM bindings
try:
- from chm import chmlib
+ from chm import chmlib # type: ignore
except ImportError as msg:
sys.exit(
"ImportError: %s\nPlease check README file for system requirements."
@@ -70,7 +71,7 @@ class FileSource:
out.append(path)
return chmlib.CHM_ENUMERATOR_CONTINUE
- out = []
+ out: List[str] = []
if (
chmlib.chm_enumerate(
self._chm, chmlib.CHM_ENUMERATE_ALL, get_name, out
@@ -123,7 +124,7 @@ class CHM:
self.cache = {}
# Name of source directory with CHM content
if os.path.isdir(name):
- self.source = DirSource(name)
+ self.source: Union[DirSource, FileSource] = DirSource(name)
else:
self.source = FileSource(name)
self.sourcename = name
@@ -177,13 +178,14 @@ class CHM:
return self.cache["image_urls"]
def _image_urls(self):
- out = []
+ out: List[str] = []
image_catcher = ImageCatcher()
for file in self.html_files():
+ # Use latin-1, as it will accept any byte sequences
image_catcher.feed(
Entry(
self.source, file, self.filename_case, self.restore_framing
- ).correct()
+ ).correct().decode("latin-1")
)
for image_url in image_catcher.imgurls:
if not out.count(image_url):
@@ -273,7 +275,8 @@ class CHM:
def _toclevels(self):
counter = TOCCounter()
- counter.feed(self.topicstree)
+ # Use latin-1, as it will accept any byte sequences
+ counter.feed(self.topicstree.decode("latin-1"))
if counter.count > self.maxtoclvl:
return self.maxtoclvl
else:
@@ -432,7 +435,7 @@ class CHM:
self.extract_entry(
entry=key, output_file=key.lower(), destdir=tempdir
)
- htmldoc(files, self.htmldoc_exec, options, self.toclevels, output)
+ htmldoc(files, self.htmldoc_exec, options, self.toclevels(), output)
# Remove temporary files
shutil.rmtree(path=tempdir)
@@ -493,21 +496,21 @@ if (window.name != "content")
data = self.lower_links(data)
# Delete unwanted HTML elements.
- data = re.sub("<div .*teamlib\\.gif.*\\/div>", "", data)
- data = re.sub("<a href.*>\\[ Team LiB \\]<\\/a>", "", data)
+ data = re.sub(b"<div .*teamlib\\.gif.*\\/div>", b"", data)
+ data = re.sub(b"<a href.*>\\[ Team LiB \\]<\\/a>", b"", data)
data = re.sub(
- "<table.*larrow\\.gif.*rarrow\\.gif.*<\\/table>", "", data
+ b"<table.*larrow\\.gif.*rarrow\\.gif.*<\\/table>", b"", data
)
- data = re.sub("<a href.*next\\.gif[^>]*><\\/a>", "", data)
- data = re.sub("<a href.*previous\\.gif[^>]*><\\/a>", "", data)
- data = re.sub("<a href.*prev\\.gif[^>]*><\\/a>", "", data)
- data = re.sub('"[^"]*previous\\.gif"', '""', data)
- data = re.sub('"[^"]*prev\\.gif"', '""', data)
- data = re.sub('"[^"]*next\\.gif"', '""', data)
+ data = re.sub(b"<a href.*next\\.gif[^>]*><\\/a>", b"", data)
+ data = re.sub(b"<a href.*previous\\.gif[^>]*><\\/a>", b"", data)
+ data = re.sub(b"<a href.*prev\\.gif[^>]*><\\/a>", b"", data)
+ data = re.sub(b'"[^"]*previous\\.gif"', b'""', data)
+ data = re.sub(b'"[^"]*prev\\.gif"', b'""', data)
+ data = re.sub(b'"[^"]*next\\.gif"', b'""', data)
if data is not None:
return data
else:
- return ""
+ return b""
def get(self):
"""Get CHM entry content"""
@@ -524,4 +527,4 @@ if (window.name != "content")
if data is not None:
return data
else:
- return ""
+ return b""
diff --git a/archmage/CHMParser.py b/archmage/CHMParser.py
index 1ac1e2b..02c8c37 100644
--- a/archmage/CHMParser.py
+++ b/archmage/CHMParser.py
@@ -2,7 +2,7 @@
#
# archmage -- CHM decompressor
# Copyright (c) 2009 Basil Shubin <bashu@users.sourceforge.net>
-# Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net>
+# Copyright (c) 2015-2020 Misha Gusarov <dottedmag@dottedmag.net>
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -21,9 +21,10 @@
import re
import mimetypes
-import sgmllib, urllib.request, urllib.error, urllib.parse
+import sgmllib # type: ignore
+import urllib.request, urllib.error, urllib.parse
-from bs4 import BeautifulSoup, UnicodeDammit
+from bs4 import BeautifulSoup, UnicodeDammit # type: ignore
from html.parser import HTMLParser
from urllib.parse import urlparse
diff --git a/archmage/__init__.py b/archmage/__init__.py
index 8f1d5c5..804becf 100644
--- a/archmage/__init__.py
+++ b/archmage/__init__.py
@@ -3,7 +3,7 @@
# archmage -- CHM decompressor
# Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net>
# Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
-# Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net>
+# Copyright (c) 2015-2020 Misha Gusarov <dottedmag@dottedmag.net>
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
diff --git a/archmage/arch.conf b/archmage/arch.conf
index bb5432a..c9208a4 100644
--- a/archmage/arch.conf
+++ b/archmage/arch.conf
@@ -56,7 +56,7 @@ chmtohtml = '-t html -f "%(output)s" --book %(toc)s --no-numbered --toctitle "Ta
# CHM2PDF converting. Use following command to convert CHM content to a single
# PDF file. Make sure that htmldoc is available on your system.
-chmtopdf = '-t pdf14 -f "%(output)s" --book %(toc)s --no-numbered --toctitle "Table of Contents" --title --textcolor "#000000" --linkcolor "#0000ff" --linkstyle plain --size Universal --left 1.00in --right 0.50in --top 0.50in --bottom 0.50in --header .t. --header1 ... --footer h.1 --nup 1 --tocheader .t. --tocfooter ..i --portrait --color --no-pscommands --no-xrxcomments --compression=1 --jpeg=0 --fontsize 11.0 --fontspacing 1.2 --headingfont Helvetica --bodyfont Times --headfootsize 11.0 --headfootfont Helvetica --charset iso-8859-1 --links --embedfonts --pagemode outline --pagelayout single --firstpage c1 --pageeffect none --pageduration 10 --effectduration 1.0 --no-encryption --permissions all --owner-password "" --user-password "" --browserwidth 680 --no-strict --no-overflow --quiet'
+chmtopdf = '-t pdf14 -f "%(output)s" --webpage %(toc)s --no-title --no-numbered --toctitle "Table of Contents" --textcolor "#000000" --linkcolor "#0000ff" --linkstyle plain --size Universal --left 1.00in --right 0.50in --top 0.50in --bottom 0.50in --header .t. --header1 ... --footer h.1 --nup 1 --tocheader .t. --tocfooter ..i --portrait --color --no-pscommands --no-xrxcomments --compression=1 --jpeg=0 --fontsize 11.0 --fontspacing 1.2 --headingfont Helvetica --bodyfont Times --headfootsize 11.0 --headfootfont Helvetica --charset iso-8859-1 --links --embedfonts --pagemode outline --pagelayout single --firstpage c1 --pageeffect none --pageduration 10 --effectduration 1.0 --no-encryption --permissions all --owner-password "" --user-password "" --browserwidth 680 --no-strict --no-overflow --quiet'
# Maximum Table of Content levels for htmldoc utility.
#
diff --git a/archmage/cli.py b/archmage/cli.py
index a7fd54a..8a573f7 100755
--- a/archmage/cli.py
+++ b/archmage/cli.py
@@ -3,7 +3,7 @@
# archmage -- CHM decompressor
# Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net>
# Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
-# Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net>
+# Copyright (c) 2015-2020 Misha Gusarov <dottedmag@dottedmag.net>
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
diff --git a/archmage/htmldoc.py b/archmage/htmldoc.py
index 606fea1..b223dfd 100644
--- a/archmage/htmldoc.py
+++ b/archmage/htmldoc.py
@@ -21,7 +21,6 @@
"""Generic converter function"""
import os
-import string
import tempfile
import subprocess
@@ -42,10 +41,10 @@ def htmldoc(input, cmd, options, toclevels, output):
options = options % {"output": output, "toc": toc}
if input:
# Create a htmldoc file for batch processing
- f = tempfile.NamedTemporaryFile(delete=False)
- f.write("#HTMLDOC 1.8.27\n")
- f.write(options + "\n")
- f.write(string.join(input, "\n"))
+ f = tempfile.NamedTemporaryFile(mode="wb", delete=False)
+ f.write(b"#HTMLDOC 1.8.27\n")
+ f.write(options.encode("utf-8") + b"\n")
+ f.write(b'\n'.join(f.encode('utf-8') for f in input))
f.close()
# Prepare command line to execute
command = "%s --batch %s" % (cmd, f.name)
diff --git a/setup.py b/setup.py
index 630a675..092372d 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@ setup(
name="archmage",
version="0.4.2.1",
description="CHM decompressor",
- maintainer="Mikhail Gusarov",
+ maintainer="Misha Gusarov",
maintainer_email="dottedmag@dottedmag.net",
url="https://github.com/dottedmag/archmage",
license="GPLv2+",

View file

@ -0,0 +1,93 @@
.\" Man page generated from reStructuredText.
.
.TH ARCHMAGE 1 "2020-11-20" "0.4.2.1" "SlackBuilds.org"
.SH NAME
archmage \- convert CHM to PDF, HTML, or plain text
.
.nr rst2man-indent-level 0
.
.de1 rstReportMargin
\\$1 \\n[an-margin]
level \\n[rst2man-indent-level]
level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
-
\\n[rst2man-indent0]
\\n[rst2man-indent1]
\\n[rst2man-indent2]
..
.de1 INDENT
.\" .rstReportMargin pre:
. RS \\$1
. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
. nr rst2man-indent-level +1
.\" .rstReportMargin post:
..
.de UNINDENT
. RE
.\" indent \\n[an-margin]
.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
.nr rst2man-indent-level -1
.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
..
.\" RST source for archmage(1) man page. Convert with:
.
.\" rst2man.py archmage.rst > archmage.1
.
.\" rst2man.py comes from the SBo development/docutils package.
.
.SH SYNOPSIS
.sp
archmage [\fI\-options\fP] \fBchmfile\fP [\fBdestdir\fP | \fBdestfile\fP]
.SH DESCRIPTION
.sp
\fBarchmage\fP converts CHM files to HTML, plain text and PDF. CHM is the
format used by Microsoft HTML Help, also known as Compiled HTML.
.SH OPTIONS
.INDENT 0.0
.TP
.B \-x\fP,\fB \-\-extract
Extracts CHM file into specified directory. If destination
directory is omitted, then a new one will be created based
on the name of the CHM file. This option is the default.
.TP
.BI \-c\fP,\fB \-\-convert\fB= format
Convert CHM file into specified file format. If destination
file is omitted, the output filename will be created based on
the name of the CHM file. Available formats:
.INDENT 7.0
.INDENT 3.5
.sp
.nf
.ft C
html \- Single HTML file
text \- Plain text file (uses lynx(1) or elinks(1))
pdf \- Adobe PDF
.ft P
.fi
.UNINDENT
.UNINDENT
.TP
.B \-d\fP,\fB \-\-dump
Dump HTML data from CHM file to standard output.
.TP
.B \-V\fP,\fB \-\-version
Print version number and exit.
.TP
.B \-h\fP,\fB \-\-help
Print help message and exit.
.UNINDENT
.SH COPYRIGHT
.sp
See the file /usr/doc/archmage\-0.4.2.1/COPYING for license information.
.SH AUTHORS
.sp
archmage was written by dottedmag.
.sp
This man page was written for the SlackBuilds.org project
by B. Watson, and is licensed under the WTFPL.
.SH SEE ALSO
.sp
The archmage homepage: \fI\%https://github.com/dottedmag/archmage\fP
.\" Generated by docutils manpage writer.
.

View file

@ -0,0 +1,77 @@
#!/bin/bash

# Slackware build script for archmage

# Written by B. Watson (yalhcru@gmail.com)

# Licensed under the WTFPL. See http://www.wtfpl.net/txt/copying/ for details.

# Overview: unpacks the archmage source tarball found next to this script,
# applies the a8f632dd.diff patch, installs with setup.py into a staging
# directory, adds the man page and documentation, then runs makepkg.
#
# Variables that may be overridden from the environment:
#   VERSION, BUILD, TAG, PKGTYPE, ARCH, TMP, OUTPUT
# If PRINT_PACKAGE_NAME is set to a non-empty value, the script only prints
# the name of the package it would create and exits.

cd $(dirname $0) ; CWD=$(pwd)

PRGNAM=archmage
VERSION=${VERSION:-0.4.2.1}
BUILD=${BUILD:-1}
TAG=${TAG:-_SBo}
# Default the package type once, up front, so the PRINT_PACKAGE_NAME output
# below and the makepkg call at the end always agree on the file extension.
PKGTYPE=${PKGTYPE:-tgz}

# Pick a package architecture from the running machine unless the caller
# already supplied one.
if [ -z "$ARCH" ]; then
  case "$( uname -m )" in
    i?86) ARCH=i586 ;;
    arm*) ARCH=arm ;;
       *) ARCH=$( uname -m ) ;;
  esac
fi

# Report the output package name without building anything.
if [ ! -z "${PRINT_PACKAGE_NAME}" ]; then
  echo "$PRGNAM-$VERSION-$ARCH-$BUILD$TAG.$PKGTYPE"
  exit 0
fi

TMP=${TMP:-/tmp/SBo}
PKG=$TMP/package-$PRGNAM
OUTPUT=${OUTPUT:-/tmp}

# NOTE: SLKCFLAGS and LIBDIRSUFFIX are set here but are not referenced
# anywhere later in this script.
if [ "$ARCH" = "i586" ]; then
  SLKCFLAGS="-O2 -march=i586 -mtune=i686"
  LIBDIRSUFFIX=""
elif [ "$ARCH" = "i686" ]; then
  SLKCFLAGS="-O2 -march=i686 -mtune=i686"
  LIBDIRSUFFIX=""
elif [ "$ARCH" = "x86_64" ]; then
  SLKCFLAGS="-O2 -fPIC"
  LIBDIRSUFFIX="64"
else
  SLKCFLAGS="-O2"
  LIBDIRSUFFIX=""
fi

# From here on, abort on the first failing command.
set -e

# Start from a clean staging directory and a freshly extracted source tree.
rm -rf $PKG
mkdir -p $TMP $PKG $OUTPUT
cd $TMP
rm -rf $PRGNAM-$VERSION
tar xvf $CWD/$PRGNAM-$VERSION.tar.gz
cd $PRGNAM-$VERSION

# Normalize ownership and permissions: anything with an execute bit becomes
# 755, everything else becomes 644.
chown -R root:root .
find -L . -perm /111 -a \! -perm 755 -a -exec chmod 755 {} \+ -o \
 \! -perm /111 -a \! -perm 644 -a -exec chmod 644 {} \+

# latest git as of 20220107. fixes PDF conversion.
patch -p1 < $CWD/a8f632dd.diff

python3 setup.py install --root $PKG

# man page written by SlackBuild author. Upstream has a man page, but
# it's a stub (doesn't list the options, etc).
mkdir -p $PKG/usr/man/man1
gzip -9c < $CWD/$PRGNAM.1 > $PKG/usr/man/man1/$PRGNAM.1.gz

# Ship upstream docs plus a copy of this build script.
mkdir -p $PKG/usr/doc/$PRGNAM-$VERSION
cp -a AUTHORS* COPYING* NEWS* README* $PKG/usr/doc/$PRGNAM-$VERSION
cat $CWD/$PRGNAM.SlackBuild > $PKG/usr/doc/$PRGNAM-$VERSION/$PRGNAM.SlackBuild

mkdir -p $PKG/install
cat $CWD/slack-desc > $PKG/install/slack-desc

cd $PKG
/sbin/makepkg -l y -c n $OUTPUT/$PRGNAM-$VERSION-$ARCH-$BUILD$TAG.$PKGTYPE

View file

@ -0,0 +1,10 @@
PRGNAM="archmage"
VERSION="0.4.2.1"
HOMEPAGE="https://github.com/dottedmag/archmage"
DOWNLOAD="https://github.com/dottedmag/archmage/archive/v0.4.2.1/archmage-0.4.2.1.tar.gz"
MD5SUM="af3b4393d5d8912ddf93d722725e9b70"
DOWNLOAD_x86_64=""
MD5SUM_x86_64=""
REQUIRES="pychm BeautifulSoup4 htmldoc python3-sgmllib3k"
MAINTAINER="B. Watson"
EMAIL="yalhcru@gmail.com"

View file

@ -0,0 +1,74 @@
.. RST source for archmage(1) man page. Convert with:
.. rst2man.py archmage.rst > archmage.1
.. rst2man.py comes from the SBo development/docutils package.
.. |version| replace:: 0.4.2.1
.. |date| date::
========
archmage
========
---------------------------------------
convert CHM to PDF, HTML, or plain text
---------------------------------------
:Manual section: 1
:Manual group: SlackBuilds.org
:Date: |date|
:Version: |version|
SYNOPSIS
========
archmage [*-options*] **chmfile** [**destdir** | **destfile**]
DESCRIPTION
===========
**archmage** converts CHM files to HTML, plain text and PDF. CHM is the
format used by Microsoft HTML Help, also known as Compiled HTML.
OPTIONS
=======
-x, --extract
Extracts CHM file into specified directory. If destination
directory is omitted, then a new one will be created based
on the name of the CHM file. This option is the default.
-c, --convert=format
Convert CHM file into specified file format. If destination
file is omitted, the output filename will be created based on
the name of the CHM file. Available formats::
html - Single HTML file
text - Plain text file (uses lynx(1) or elinks(1))
pdf - Adobe PDF
-d, --dump
Dump HTML data from CHM file to standard output.
-V, --version
Print version number and exit.
-h, --help
Print help message and exit.
COPYRIGHT
=========
See the file /usr/doc/archmage-|version|/COPYING for license information.
AUTHORS
=======
archmage was written by dottedmag.
This man page was written for the SlackBuilds.org project
by B. Watson, and is licensed under the WTFPL.
SEE ALSO
========
The archmage homepage: https://github.com/dottedmag/archmage

View file

@ -0,0 +1,19 @@
# HOW TO EDIT THIS FILE:
# The "handy ruler" below makes it easier to edit a package description.
# Line up the first '|' above the ':' following the base package name, and
# the '|' on the right side marks the last column you can put a character in.
# You must make exactly 11 lines for the formatting to be correct. It's also
# customary to leave one space after the ':' except on otherwise blank lines.
|-----handy-ruler------------------------------------------------------|
archmage: archmage (CHM to HTML/PDF/text converter)
archmage:
archmage: arCHMage converts CHM files to HTML, plain text, or PDF. CHM is the
archmage: format used by Microsoft HTML Help, also known as Compiled HTML.
archmage:
archmage:
archmage:
archmage:
archmage:
archmage:
archmage: