{
  "_id": "6a1f18feb401979e7341ee3b",
  "Package": "boilerpipeR",
  "Version": "1.3.2",
  "Date": "2021-05-19",
  "Title": "Interface to the Boilerpipe Java Library",
  "Author": "See AUTHORS file.",
  "Maintainer": "Mario Annau <mario.annau@gmail.com>",
  "Description": "Generic Extraction of main text content from HTML files;\nremoval of ads, sidebars and headers using the boilerpipe\n<https://github.com/kohlschutter/boilerpipe> Java library. The\nextraction heuristics from boilerpipe show a robust performance\nfor a wide range of web site templates.",
  "License": "Apache License (== 2.0)",
  "URL": "https://github.com/mannau/boilerpipeR",
  "BugReports": "https://github.com/mannau/boilerpipeR/issues",
  "RoxygenNote": "7.1.1",
  "Encoding": "UTF-8",
  "Config/pak/sysreqs": "make default-jdk",
  "Repository": "https://mannau.r-universe.dev",
  "Date/Publication": "2021-05-19 09:05:01 UTC",
  "RemoteUrl": "https://github.com/mannau/boilerpiper",
  "RemoteRef": "HEAD",
  "RemoteSha": "5cbc092cac7717a776bef1b54d3021959c4bcc7b",
  "NeedsCompilation": "no",
  "Packaged": {
    "Date": "2026-05-17 05:31:25 UTC",
    "User": "root"
  },
  "MD5sum": "f44902aa1543db20e1beed6c8c22cd60",
  "_user": "mannau",
  "_type": "src",
  "_file": "boilerpipeR_1.3.2.tar.gz",
  "_fileid": "f31fb749787c84040df31713674a3c05bfed7818cf77c6bdab94d0f61b9be970",
  "_filesize": 1674216,
  "_sha256": "f31fb749787c84040df31713674a3c05bfed7818cf77c6bdab94d0f61b9be970",
  "_created": "2026-05-17T05:31:25.000Z",
  "_published": "2026-06-02T17:55:10.402Z",
  "_distro": "noble",
  "_jobs": [
    {
      "job": 79137029449,
      "time": 123,
      "config": "linux-devel-x86_64",
      "r": "4.7.0",
      "check": "NOTE",
      "artifact": "7039644338"
    },
    {
      "job": 79137029986,
      "time": 112,
      "config": "linux-release-x86_64",
      "r": "4.6.0",
      "check": "NOTE",
      "artifact": "7039643327"
    },
    {
      "job": 79137029421,
      "time": 150,
      "config": "macos-oldrel-arm64",
      "r": "4.5.3",
      "check": "NOTE",
      "artifact": "7039644216"
    },
    {
      "job": 79137029720,
      "time": 148,
      "config": "macos-release-arm64",
      "r": "4.6.0",
      "check": "NOTE",
      "artifact": "7039644224"
    },
    {
      "job": 79137028480,
      "time": 174,
      "config": "source",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7039632267"
    },
    {
      "job": 79137028709,
      "time": 100,
      "config": "wasm-release",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7365708867"
    },
    {
      "job": 79137029467,
      "time": 80,
      "config": "windows-devel",
      "r": "4.7.0",
      "check": "NOTE",
      "artifact": "7039640314"
    },
    {
      "job": 79137029642,
      "time": 60,
      "config": "windows-oldrel",
      "r": "4.5.3",
      "check": "NOTE",
      "artifact": "7039638463"
    },
    {
      "job": 79137029416,
      "time": 91,
      "config": "windows-release",
      "r": "4.6.0",
      "check": "NOTE",
      "artifact": "7039641310"
    }
  ],
  "_buildurl": "https://github.com/r-universe/mannau/actions/runs/25982384609",
  "_status": "success",
  "_host": "GitHub-Actions",
  "_upstream": "https://github.com/mannau/boilerpiper",
  "_commit": {
    "id": "5cbc092cac7717a776bef1b54d3021959c4bcc7b",
    "author": "mannau <mario.annau@quantargo.com>",
    "committer": "mannau <mario.annau@quantargo.com>",
    "message": "Add angle brackets to DESCRIPTION URL\n",
    "time": 1621415101
  },
  "_maintainer": {
    "name": "Mario Annau",
    "email": "mario.annau@gmail.com",
    "login": "mannau",
    "description": "CEO and Founder of Quantargo, certified trainer and consultant working on data science projects.",
    "uuid": 1262952
  },
  "_registered": true,
  "_dependencies": [
    {
      "package": "rJava",
      "role": "Imports"
    },
    {
      "package": "RCurl",
      "role": "Suggests"
    }
  ],
  "_owner": "mannau",
  "_selfowned": true,
  "_usedby": 0,
  "_updates": [],
  "_tags": [],
  "_stars": 21,
  "_contributors": [
    {
      "user": "mannau",
      "count": 38,
      "uuid": 1262952
    }
  ],
  "_userbio": {
    "uuid": 1262952,
    "type": "user",
    "name": "Mario Annau",
    "description": "CEO and Founder of Quantargo, certified trainer and consultant working on data science projects."
  },
  "_downloads": {
    "count": 262,
    "source": "https://cranlogs.r-pkg.org/downloads/total/last-month/boilerpipeR"
  },
  "_devurl": "https://github.com/mannau/boilerpiper",
  "_searchresults": 30,
  "_topics": [
    "openjdk"
  ],
  "_rbuild": "4.6.0",
  "_assets": [
    "extra/boilerpipeR.html",
    "extra/citation.cff",
    "extra/citation.html",
    "extra/citation.json",
    "extra/citation.txt",
    "extra/contents.json",
    "extra/NEWS.html",
    "extra/NEWS.txt",
    "extra/readme.html",
    "extra/readme.md",
    "manual.pdf"
  ],
  "_homeurl": "https://github.com/mannau/boilerpiper",
  "_realowner": "mannau",
  "_cranurl": true,
  "_releases": [
    {
      "version": "1.0",
      "date": "2012-12-21"
    },
    {
      "version": "1.1",
      "date": "2014-01-11"
    },
    {
      "version": "1.2",
      "date": "2014-05-12"
    },
    {
      "version": "1.2.2",
      "date": "2014-08-21"
    },
    {
      "version": "1.3",
      "date": "2015-05-11"
    },
    {
      "version": "1.3.2",
      "date": "2021-05-19"
    }
  ],
  "_exports": [
    "ArticleExtractor",
    "ArticleSentencesExtractor",
    "CanolaExtractor",
    "DefaultExtractor",
    "Extractor",
    "KeepEverythingExtractor",
    "LargestContentExtractor",
    "NumWordsRulesExtractor"
  ],
  "_datasets": [
    {
      "name": "content",
      "title": "Wordpress generated Webpage (retrieved from Quantivity Blog <https://quantivity.wordpress.com>). Content is saved as character and ready to be extracted.",
      "object": "content",
      "file": "content.rda",
      "class": [
        "character"
      ],
      "fields": [],
      "table": false,
      "tojson": true
    }
  ],
  "_help": [
    {
      "page": "boilerpipeR-package",
      "title": "Extract the main content from HTML files",
      "topics": [
        "boilerpipeR-package",
        "boilerpipe"
      ]
    },
    {
      "page": "ArticleExtractor",
      "title": "A full-text extractor which is tuned towards news articles.",
      "topics": [
        "ArticleExtractor"
      ]
    },
    {
      "page": "ArticleSentencesExtractor",
      "title": "A full-text extractor which is tuned towards extracting sentences from news articles.",
      "topics": [
        "ArticleSentencesExtractor"
      ]
    },
    {
      "page": "CanolaExtractor",
      "title": "A full-text extractor trained on a 'krdwrd' Canola (see 'https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf').",
      "topics": [
        "CanolaExtractor"
      ]
    },
    {
      "page": "content",
      "title": "Wordpress generated Webpage (retrieved from Quantivity Blog <https://quantivity.wordpress.com>). Content is saved as character and ready to be extracted.",
      "topics": [
        "content"
      ]
    },
    {
      "page": "DefaultExtractor",
      "title": "A quite generic full-text extractor.",
      "topics": [
        "DefaultExtractor"
      ]
    },
    {
      "page": "Extractor",
      "title": "Generic extraction function which calls boilerpipe extractors",
      "topics": [
        "Extractor"
      ]
    },
    {
      "page": "KeepEverythingExtractor",
      "title": "Marks everything as content.",
      "topics": [
        "KeepEverythingExtractor"
      ]
    },
    {
      "page": "LargestContentExtractor",
      "title": "A full-text extractor which extracts the largest text component of a page.",
      "topics": [
        "LargestContentExtractor"
      ]
    },
    {
      "page": "NumWordsRulesExtractor",
      "title": "A quite generic full-text extractor solely based upon the number of words per block (the current, the previous and the next block).",
      "topics": [
        "NumWordsRulesExtractor"
      ]
    }
  ],
  "_readme": "https://github.com/mannau/boilerpiper/raw/HEAD/README.md",
  "_rundeps": [
    "rJava"
  ],
  "_sysdeps": [
    {
      "shlib": "libjvm",
      "package": "openjdk-21-jre-headless",
      "headers": "openjdk-21-jre-headless",
      "source": "openjdk",
      "version": "21.0.10+7-1~24.04",
      "name": "openjdk",
      "homepage": "https://openjdk.java.net/",
      "description": "OpenJDK Java runtime, using Hotspot JIT (headless)"
    }
  ],
  "_vignettes": [
    {
      "source": "ShortIntro.Rnw",
      "filename": "ShortIntro.pdf",
      "title": "Introduction to the tm.plugin.webmining Package",
      "engine": "utils::Sweave",
      "headings": [],
      "created": "2013-12-31 12:37:32",
      "modified": "2021-05-19 07:49:05",
      "commits": 4
    }
  ],
  "_score": 5.498310553789601,
  "_indexed": true,
  "_nocasepkg": "boilerpiper",
  "_universes": [
    "mannau"
  ],
  "_binaries": [
    {
      "r": "4.7.0",
      "os": "linux",
      "version": "1.3.2",
      "date": "2026-05-17T05:33:29.000Z",
      "distro": "noble",
      "commit": "5cbc092cac7717a776bef1b54d3021959c4bcc7b",
      "fileid": "267bd3125deebc84df94c195d176f69858edb5adf5ff39929689192eedd0b829",
      "status": "success",
      "check": "NOTE",
      "buildurl": "https://github.com/r-universe/mannau/actions/runs/25982384609"
    },
    {
      "r": "4.6.0",
      "os": "linux",
      "version": "1.3.2",
      "date": "2026-05-17T05:33:18.000Z",
      "distro": "noble",
      "commit": "5cbc092cac7717a776bef1b54d3021959c4bcc7b",
      "fileid": "20a58e10f3a473f62c4400049b8a73dfb73ca139e68ede429e53cb4e43451f7f",
      "status": "success",
      "check": "NOTE",
      "buildurl": "https://github.com/r-universe/mannau/actions/runs/25982384609"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "1.3.2",
      "date": "2026-05-17T05:33:35.000Z",
      "commit": "5cbc092cac7717a776bef1b54d3021959c4bcc7b",
      "fileid": "51c174e5abef6d264757b125565612c0303b58cb041b70b74c8e8d57d8871801",
      "status": "success",
      "check": "NOTE",
      "buildurl": "https://github.com/r-universe/mannau/actions/runs/25982384609"
    },
    {
      "r": "4.6.0",
      "os": "mac",
      "version": "1.3.2",
      "date": "2026-05-17T05:33:30.000Z",
      "commit": "5cbc092cac7717a776bef1b54d3021959c4bcc7b",
      "fileid": "ead9d043b89ecb9c5ea5cb893383111659e9e39c2b9765a7858ff26c22920e37",
      "status": "success",
      "check": "NOTE",
      "buildurl": "https://github.com/r-universe/mannau/actions/runs/25982384609"
    },
    {
      "r": "4.7.0",
      "os": "win",
      "version": "1.3.2",
      "date": "2026-05-17T05:32:36.000Z",
      "commit": "5cbc092cac7717a776bef1b54d3021959c4bcc7b",
      "fileid": "459bc60c93825bddbdefdb0e2edc0dedb7e0a5410166022bccda501b2a82867d",
      "status": "success",
      "check": "NOTE",
      "buildurl": "https://github.com/r-universe/mannau/actions/runs/25982384609"
    },
    {
      "r": "4.5.3",
      "os": "win",
      "version": "1.3.2",
      "date": "2026-05-17T05:32:24.000Z",
      "commit": "5cbc092cac7717a776bef1b54d3021959c4bcc7b",
      "fileid": "dbe1bc43aa18d567f89e1bd671a9cc7db08f70badd24e7d4fd06961d4e5f37a5",
      "status": "success",
      "check": "NOTE",
      "buildurl": "https://github.com/r-universe/mannau/actions/runs/25982384609"
    },
    {
      "r": "4.6.0",
      "os": "win",
      "version": "1.3.2",
      "date": "2026-05-17T05:32:29.000Z",
      "commit": "5cbc092cac7717a776bef1b54d3021959c4bcc7b",
      "fileid": "119e05b96eacad83623c610ffebe7223ed304810d3733757403714c44e121281",
      "status": "success",
      "check": "NOTE",
      "buildurl": "https://github.com/r-universe/mannau/actions/runs/25982384609"
    },
    {
      "r": "4.6.0",
      "os": "wasm",
      "version": "1.3.2",
      "date": "2026-06-02T17:54:49.000Z",
      "commit": "5cbc092cac7717a776bef1b54d3021959c4bcc7b",
      "fileid": "f73f448f4001b0ab4208d3dd3273c9b91b6132973c17acc62d2d6c3bdbda6b9a",
      "status": "success",
      "buildurl": "https://github.com/r-universe/mannau/actions/runs/25982384609"
    }
  ]
}