From 004c61c7bdfbd12d9a40803752dc5b85f55903fb Mon Sep 17 00:00:00 2001 From: Hunter Sezen Date: Wed, 9 Dec 2015 01:25:33 +0700 Subject: [PATCH] libraries/libexttextcat: Added (Text Categorization library). Signed-off-by: Willy Sudiarto Raharjo --- libraries/libexttextcat/README | 20 ++++ .../libexttextcat/libexttextcat.SlackBuild | 93 +++++++++++++++++++ libraries/libexttextcat/libexttextcat.info | 10 ++ libraries/libexttextcat/slack-desc | 19 ++++ 4 files changed, 142 insertions(+) create mode 100644 libraries/libexttextcat/README create mode 100644 libraries/libexttextcat/libexttextcat.SlackBuild create mode 100644 libraries/libexttextcat/libexttextcat.info create mode 100644 libraries/libexttextcat/slack-desc diff --git a/libraries/libexttextcat/README b/libraries/libexttextcat/README new file mode 100644 index 0000000000..3b9743c04a --- /dev/null +++ b/libraries/libexttextcat/README @@ -0,0 +1,20 @@ +Libtextcat is a library with functions that implement the +classification technique described in Cavnar & Trenkle, "N-Gram-Based +Text Categorization". It was primarily developed for language +guessing, a task on which it is known to perform with near-perfect +accuracy. + +The central idea of the Cavnar & Trenkle technique is to calculate a +"fingerprint" of a document with an unknown category, and compare this +with the fingerprints of a number of documents of which the categories +are known. The categories of the closest matches are output as the +classification. A fingerprint is a list of the most frequent n-grams +occurring in a document, ordered by frequency. Fingerprints are +compared with a simple out-of-place metric. See the article for more +details. + +Considerable effort went into making this implementation fast and +efficient. The language guesser processes over 100 documents/second on +a simple PC, which makes it practical for many uses. 
It was developed +for use in our webcrawler and search engine software, in which it +handles millions of documents a day. diff --git a/libraries/libexttextcat/libexttextcat.SlackBuild b/libraries/libexttextcat/libexttextcat.SlackBuild new file mode 100644 index 0000000000..5660f3930c --- /dev/null +++ b/libraries/libexttextcat/libexttextcat.SlackBuild @@ -0,0 +1,93 @@ +#!/bin/sh + +# Slackware build script for libexttextcat + +# Copyright 2015 Hunter Sezen California, USA +# All rights reserved. +# +# Redistribution and use of this script, with or without modification, is +# permitted provided that the following conditions are met: +# +# 1. Redistributions of this script must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +PRGNAM=libexttextcat +VERSION=${VERSION:-3.4.4} +BUILD=${BUILD:-1} +TAG=${TAG:-_SBo} + +if [ -z "$ARCH" ]; then + case "$( uname -m )" in + i?86) ARCH=i486 ;; + arm*) ARCH=arm ;; + *) ARCH=$( uname -m ) ;; + esac +fi + +CWD=$(pwd) +TMP=${TMP:-/tmp/SBo} +PKG=$TMP/package-$PRGNAM +OUTPUT=${OUTPUT:-/tmp} + +if [ "$ARCH" = "i486" ]; then + SLKCFLAGS="-O2 -march=i486 -mtune=i686" + LIBDIRSUFFIX="" +elif [ "$ARCH" = "i686" ]; then + SLKCFLAGS="-O2 -march=i686 -mtune=i686" + LIBDIRSUFFIX="" +elif [ "$ARCH" = "x86_64" ]; then + SLKCFLAGS="-O2 -fPIC" + LIBDIRSUFFIX="64" +else + SLKCFLAGS="-O2" + LIBDIRSUFFIX="" +fi + +set -e + +rm -rf $PKG +mkdir -p $TMP $PKG $OUTPUT +cd $TMP +rm -rf $PRGNAM-$VERSION +tar xvf $CWD/$PRGNAM-$VERSION.tar.xz +cd $PRGNAM-$VERSION +chown -R root:root . +find -L . \ + \( -perm 777 -o -perm 775 -o -perm 750 -o -perm 711 -o -perm 555 \ + -o -perm 511 \) -exec chmod 755 {} \; -o \ + \( -perm 666 -o -perm 664 -o -perm 640 -o -perm 600 -o -perm 444 \ + -o -perm 440 -o -perm 400 \) -exec chmod 644 {} \; + +CFLAGS="$SLKCFLAGS" \ +CXXFLAGS="$SLKCFLAGS" \ +./configure \ + --prefix=/usr \ + --libdir=/usr/lib${LIBDIRSUFFIX} \ + --build=$ARCH-slackware-linux + +make +make install DESTDIR=$PKG + +find $PKG -print0 | xargs -0 file | grep -e "executable" -e "shared object" | grep ELF \ + | cut -f 1 -d : | xargs strip --strip-unneeded 2> /dev/null || true + +mkdir -p $PKG/usr/doc/$PRGNAM-$VERSION +cp -a ChangeLog LICENSE READM* TODO $PKG/usr/doc/$PRGNAM-$VERSION +cat $CWD/$PRGNAM.SlackBuild > $PKG/usr/doc/$PRGNAM-$VERSION/$PRGNAM.SlackBuild + +mkdir -p $PKG/install +cat $CWD/slack-desc > $PKG/install/slack-desc + +cd $PKG +/sbin/makepkg -l y -c n $OUTPUT/$PRGNAM-$VERSION-$ARCH-$BUILD$TAG.${PKGTYPE:-tgz} diff --git a/libraries/libexttextcat/libexttextcat.info b/libraries/libexttextcat/libexttextcat.info new file mode 100644 index 0000000000..cf4e410507 --- /dev/null +++ b/libraries/libexttextcat/libexttextcat.info @@ -0,0 +1,10 @@ +PRGNAM="libexttextcat" 
+VERSION="3.4.4" +HOMEPAGE="https://wiki.freedesktop.org/www/Software/libexttextcat/" +DOWNLOAD="http://dev-www.libreoffice.org/src/libexttextcat/libexttextcat-3.4.4.tar.xz" +MD5SUM="bfa7107c27afda3a3afa4b7ab5a3fe17" +DOWNLOAD_x86_64="" +MD5SUM_x86_64="" +REQUIRES="" +MAINTAINER="Hunter Sezen" +EMAIL="ovariegata@yahoo.com" diff --git a/libraries/libexttextcat/slack-desc b/libraries/libexttextcat/slack-desc new file mode 100644 index 0000000000..4799cd6380 --- /dev/null +++ b/libraries/libexttextcat/slack-desc @@ -0,0 +1,19 @@ +# HOW TO EDIT THIS FILE: +# The "handy ruler" below makes it easier to edit a package description. +# Line up the first '|' above the ':' following the base package name, and +# the '|' on the right side marks the last column you can put a character in. +# You must make exactly 11 lines for the formatting to be correct. It's also +# customary to leave one space after the ':' except on otherwise blank lines. + + |-----handy-ruler------------------------------------------------------| +libexttextcat: libexttextcat (N-Gram-Based Text Categorization library) +libexttextcat: +libexttextcat: Libtextcat is a library with functions that implement the +libexttextcat: classification technique described in Cavnar & Trenkle, "N-Gram-Based +libexttextcat: Text Categorization". It was primarily developed for language +libexttextcat: guessing, a task on which it is known to perform with near-perfect +libexttextcat: accuracy. +libexttextcat: +libexttextcat: Homepage: https://wiki.freedesktop.org/www/Software/libexttextcat/ +libexttextcat: +libexttextcat: