diff -Nru beautifulsoup4-4.10.0/appveyor.yml beautifulsoup4-1.7.1-benchmark/appveyor.yml --- beautifulsoup4-4.10.0/appveyor.yml 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/appveyor.yml 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,50 @@ +version: '{build}' + +image: Visual Studio 2017 + +configuration: + - Debug + - Release + +environment: + matrix: + - compiler: msvc-15-seh + generator: "Visual Studio 15 2017" + + - compiler: msvc-15-seh + generator: "Visual Studio 15 2017 Win64" + + - compiler: msvc-14-seh + generator: "Visual Studio 14 2015" + + - compiler: msvc-14-seh + generator: "Visual Studio 14 2015 Win64" + + - compiler: gcc-5.3.0-posix + generator: "MinGW Makefiles" + cxx_path: 'C:\mingw-w64\i686-5.3.0-posix-dwarf-rt_v4-rev0\mingw32\bin' + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + +matrix: + fast_finish: true + +install: + # git bash conflicts with MinGW makefiles + - if "%generator%"=="MinGW Makefiles" (set "PATH=%PATH:C:\Program Files\Git\usr\bin;=%") + - if not "%cxx_path%"=="" (set "PATH=%PATH%;%cxx_path%") + +build_script: + - md _build -Force + - cd _build + - echo %configuration% + - cmake -G "%generator%" "-DCMAKE_BUILD_TYPE=%configuration%" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON .. + - cmake --build . --config %configuration% + +test_script: + - ctest --build-config %configuration% --timeout 300 --output-on-failure + +artifacts: + - path: '_build/CMakeFiles/*.log' + name: logs + - path: '_build/Testing/**/*.xml' + name: test_results diff -Nru beautifulsoup4-4.10.0/AUTHORS beautifulsoup4-1.7.1-benchmark/AUTHORS --- beautifulsoup4-4.10.0/AUTHORS 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/AUTHORS 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,67 @@ +# This is the official list of benchmark authors for copyright purposes. +# This file is distinct from the CONTRIBUTORS files. +# See the latter for an explanation. +# +# Names should be added to this file as: +# Name or Organization +# The email address is not required for organizations. +# +# Please keep the list sorted. + +Albert Pretorius +Alex Steele +Andriy Berestovskyy +Arne Beer +Carto +Cezary Skrzyński +Christian Wassermann +Christopher Seymour +Colin Braley +Daniel Harvey +David Coeurjolly +Deniz Evrenci +Dirac Research +Dominik Czarnota +Dominik Korman +Donald Aingworth +Eric Backus +Eric Fiselier +Eugene Zhuk +Evgeny Safronov +Federico Ficarelli +Felix Homann +Gergő Szitár +Google Inc. +International Business Machines Corporation +Ismael Jimenez Martinez +Jern-Kuan Leong +JianXiong Zhou +Joao Paulo Magalhaes +Jordan Williams +Jussi Knuuttila +Kaito Udagawa +Kishan Kumar +Lei Xu +Matt Clarkson +Maxim Vafin +MongoDB Inc. +Nick Hutchinson +Norman Heino +Oleksandr Sochka +Ori Livneh +Paul Redmond +Raghu Raja +Radoslav Yovchev +Rainer Orth +Roman Lebedev +Sayan Bhattacharjee +Shapr3D +Shuo Chen +Staffan Tjernstrom +Steinar H. Gunderson +Stripe, Inc. +Tobias Schmidt +Yixuan Qiu +Yusuke Suzuki +Zbigniew Skowron +Min-Yih Hsu diff -Nru beautifulsoup4-4.10.0/beautifulsoup4.egg-info/dependency_links.txt beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/dependency_links.txt --- beautifulsoup4-4.10.0/beautifulsoup4.egg-info/dependency_links.txt 2021-09-08 00:13:24.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/dependency_links.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ - diff -Nru beautifulsoup4-4.10.0/beautifulsoup4.egg-info/PKG-INFO beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/PKG-INFO --- beautifulsoup4-4.10.0/beautifulsoup4.egg-info/PKG-INFO 2021-09-08 00:13:24.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/PKG-INFO 1970-01-01 00:00:00.000000000 +0000 @@ -1,115 +0,0 @@ -Metadata-Version: 2.1 -Name: beautifulsoup4 -Version: 4.10.0 -Summary: Screen-scraping library -Home-page: http://www.crummy.com/software/BeautifulSoup/bs4/ -Author: Leonard Richardson -Author-email: leonardr@segfault.org -License: MIT -Download-URL: http://www.crummy.com/software/BeautifulSoup/bs4/download/ -Description: Beautiful Soup is a library that makes it easy to scrape information - from web pages. It sits atop an HTML or XML parser, providing Pythonic - idioms for iterating, searching, and modifying the parse tree. - - # Quick start - - ``` - >>> from bs4 import BeautifulSoup - >>> soup = BeautifulSoup("

SomebadHTML") - >>> print(soup.prettify()) - - -

- Some - - bad - - HTML - - -

- - - >>> soup.find(text="bad") - 'bad' - >>> soup.i - HTML - # - >>> soup = BeautifulSoup("SomebadXML", "xml") - # - >>> print(soup.prettify()) - - - Some - - bad - - XML - - - ``` - - To go beyond the basics, [comprehensive documentation is available](http://www.crummy.com/software/BeautifulSoup/bs4/doc/). - - # Links - - * [Homepage](http://www.crummy.com/software/BeautifulSoup/bs4/) - * [Documentation](http://www.crummy.com/software/BeautifulSoup/bs4/doc/) - * [Discussion group](http://groups.google.com/group/beautifulsoup/) - * [Development](https://code.launchpad.net/beautifulsoup/) - * [Bug tracker](https://bugs.launchpad.net/beautifulsoup/) - * [Complete changelog](https://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/view/head:/CHANGELOG) - - # Note on Python 2 sunsetting - - Beautiful Soup's support for Python 2 was discontinued on December 31, - 2020: one year after the sunset date for Python 2 itself. From this - point onward, new Beautiful Soup development will exclusively target - Python 3. The final release of Beautiful Soup 4 to support Python 2 - was 4.9.3. - - # Supporting the project - - If you use Beautiful Soup as part of your professional work, please consider a - [Tidelift subscription](https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=readme). - This will support many of the free software projects your organization - depends on, not just Beautiful Soup. - - If you use Beautiful Soup for personal projects, the best way to say - thank you is to read - [Tool Safety](https://www.crummy.com/software/BeautifulSoup/zine/), a zine I - wrote about what Beautiful Soup has taught me about software - development. - - # Building the documentation - - The bs4/doc/ directory contains full documentation in Sphinx - format. Run `make html` in that directory to create HTML - documentation. - - # Running the unit tests - - Beautiful Soup supports unit test discovery from the project root directory: - - ``` - $ nosetests - ``` - - ``` - $ python3 -m unittest discover -s bs4 - ``` - -Platform: UNKNOWN -Classifier: Development Status :: 5 - Production/Stable -Classifier: Intended Audience :: Developers -Classifier: License :: OSI Approved :: MIT License -Classifier: Programming Language :: Python -Classifier: Programming Language :: Python :: 3 -Classifier: Topic :: Text Processing :: Markup :: HTML -Classifier: Topic :: Text Processing :: Markup :: XML -Classifier: Topic :: Text Processing :: Markup :: SGML -Classifier: Topic :: Software Development :: Libraries :: Python Modules -Requires-Python: >3.0.0 -Description-Content-Type: text/markdown -Provides-Extra: html5lib -Provides-Extra: lxml diff -Nru beautifulsoup4-4.10.0/beautifulsoup4.egg-info/requires.txt beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/requires.txt --- beautifulsoup4-4.10.0/beautifulsoup4.egg-info/requires.txt 2021-09-08 00:13:24.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/requires.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -soupsieve>1.2 - -[html5lib] -html5lib - -[lxml] -lxml diff -Nru beautifulsoup4-4.10.0/beautifulsoup4.egg-info/SOURCES.txt beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/SOURCES.txt --- beautifulsoup4-4.10.0/beautifulsoup4.egg-info/SOURCES.txt 2021-09-08 00:13:24.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/SOURCES.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,53 +0,0 @@ -COPYING.txt -LICENSE -MANIFEST.in -NEWS.txt -README.md -TODO.txt -parse.txt -setup.cfg -setup.py -test-all-versions -beautifulsoup4.egg-info/PKG-INFO -beautifulsoup4.egg-info/SOURCES.txt -beautifulsoup4.egg-info/dependency_links.txt -beautifulsoup4.egg-info/requires.txt -beautifulsoup4.egg-info/top_level.txt -bs4/__init__.py -bs4/dammit.py -bs4/diagnose.py -bs4/element.py -bs4/formatter.py -bs4/testing.py -bs4/builder/__init__.py -bs4/builder/_html5lib.py -bs4/builder/_htmlparser.py -bs4/builder/_lxml.py -bs4/tests/__init__.py -bs4/tests/test_builder_registry.py -bs4/tests/test_docs.py -bs4/tests/test_html5lib.py -bs4/tests/test_htmlparser.py -bs4/tests/test_lxml.py -bs4/tests/test_soup.py -bs4/tests/test_tree.py -doc/Makefile -doc.ptbr/Makefile -doc.ptbr/source/6.1.jpg -doc.ptbr/source/conf.py -doc.ptbr/source/index.rst -doc.ru/Makefile -doc.ru/source/6.1.jpg -doc.ru/source/bs4ru.rst -doc.ru/source/conf.py -doc.ru/source/index.rst -doc.zh/Makefile -doc.zh/source/6.1.jpg -doc.zh/source/conf.py -doc.zh/source/index.rst -doc/source/6.1.jpg -doc/source/check_doc.py -doc/source/conf.py -doc/source/index.rst -scripts/demonstrate_parser_differences.py -scripts/demonstration_markup.txt \ No newline at end of file diff -Nru beautifulsoup4-4.10.0/beautifulsoup4.egg-info/top_level.txt beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/top_level.txt --- beautifulsoup4-4.10.0/beautifulsoup4.egg-info/top_level.txt 2021-09-08 00:13:24.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/top_level.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -bs4 diff -Nru beautifulsoup4-4.10.0/bindings/python/BUILD beautifulsoup4-1.7.1-benchmark/bindings/python/BUILD --- beautifulsoup4-4.10.0/bindings/python/BUILD 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/BUILD 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,3 @@ +exports_files(glob(["*.BUILD"])) +exports_files(["build_defs.bzl"]) + diff -Nru beautifulsoup4-4.10.0/bindings/python/build_defs.bzl beautifulsoup4-1.7.1-benchmark/bindings/python/build_defs.bzl --- beautifulsoup4-4.10.0/bindings/python/build_defs.bzl 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/build_defs.bzl 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,25 @@ +_SHARED_LIB_SUFFIX = { + "//conditions:default": ".so", + "//:windows": ".dll", +} + +def py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []): + for shared_lib_suffix in _SHARED_LIB_SUFFIX.values(): + shared_lib_name = name + shared_lib_suffix + native.cc_binary( + name = shared_lib_name, + linkshared = True, + linkstatic = True, + srcs = srcs + hdrs, + copts = copts, + features = features, + deps = deps, + ) + + return native.py_library( + name = name, + data = select({ + platform: [name + shared_lib_suffix] + for platform, shared_lib_suffix in _SHARED_LIB_SUFFIX.items() + }), + ) diff -Nru beautifulsoup4-4.10.0/bindings/python/google_benchmark/benchmark.cc beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/benchmark.cc --- beautifulsoup4-4.10.0/bindings/python/google_benchmark/benchmark.cc 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/benchmark.cc 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,184 @@ +// Benchmark for Python. + +#include "benchmark/benchmark.h" + +#include +#include +#include + +#include "pybind11/operators.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "pybind11/stl_bind.h" + +PYBIND11_MAKE_OPAQUE(benchmark::UserCounters); + +namespace { +namespace py = ::pybind11; + +std::vector Initialize(const std::vector& argv) { + // The `argv` pointers here become invalid when this function returns, but + // benchmark holds the pointer to `argv[0]`. We create a static copy of it + // so it persists, and replace the pointer below. + static std::string executable_name(argv[0]); + std::vector ptrs; + ptrs.reserve(argv.size()); + for (auto& arg : argv) { + ptrs.push_back(const_cast(arg.c_str())); + } + ptrs[0] = const_cast(executable_name.c_str()); + int argc = static_cast(argv.size()); + benchmark::Initialize(&argc, ptrs.data()); + std::vector remaining_argv; + remaining_argv.reserve(argc); + for (int i = 0; i < argc; ++i) { + remaining_argv.emplace_back(ptrs[i]); + } + return remaining_argv; +} + +benchmark::internal::Benchmark* RegisterBenchmark(const char* name, + py::function f) { + return benchmark::RegisterBenchmark( + name, [f](benchmark::State& state) { f(&state); }); +} + +PYBIND11_MODULE(_benchmark, m) { + using benchmark::TimeUnit; + py::enum_(m, "TimeUnit") + .value("kNanosecond", TimeUnit::kNanosecond) + .value("kMicrosecond", TimeUnit::kMicrosecond) + .value("kMillisecond", TimeUnit::kMillisecond) + .value("kSecond", TimeUnit::kSecond) + .export_values(); + + using benchmark::BigO; + py::enum_(m, "BigO") + .value("oNone", BigO::oNone) + .value("o1", BigO::o1) + .value("oN", BigO::oN) + .value("oNSquared", BigO::oNSquared) + .value("oNCubed", BigO::oNCubed) + .value("oLogN", BigO::oLogN) + .value("oNLogN", BigO::oLogN) + .value("oAuto", BigO::oAuto) + .value("oLambda", BigO::oLambda) + .export_values(); + + using benchmark::internal::Benchmark; + py::class_(m, "Benchmark") + // For methods returning a pointer tor the current object, reference + // return policy is used to ask pybind not to take ownership oof the + // returned object and avoid calling delete on it. + // https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies + // + // For methods taking a const std::vector<...>&, a copy is created + // because a it is bound to a Python list. + // https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html + .def("unit", &Benchmark::Unit, py::return_value_policy::reference) + .def("arg", &Benchmark::Arg, py::return_value_policy::reference) + .def("args", &Benchmark::Args, py::return_value_policy::reference) + .def("range", &Benchmark::Range, py::return_value_policy::reference, + py::arg("start"), py::arg("limit")) + .def("dense_range", &Benchmark::DenseRange, + py::return_value_policy::reference, py::arg("start"), + py::arg("limit"), py::arg("step") = 1) + .def("ranges", &Benchmark::Ranges, py::return_value_policy::reference) + .def("args_product", &Benchmark::ArgsProduct, + py::return_value_policy::reference) + .def("arg_name", &Benchmark::ArgName, py::return_value_policy::reference) + .def("arg_names", &Benchmark::ArgNames, + py::return_value_policy::reference) + .def("range_pair", &Benchmark::RangePair, + py::return_value_policy::reference, py::arg("lo1"), py::arg("hi1"), + py::arg("lo2"), py::arg("hi2")) + .def("range_multiplier", &Benchmark::RangeMultiplier, + py::return_value_policy::reference) + .def("min_time", &Benchmark::MinTime, py::return_value_policy::reference) + .def("min_warmup_time", &Benchmark::MinWarmUpTime, + py::return_value_policy::reference) + .def("iterations", &Benchmark::Iterations, + py::return_value_policy::reference) + .def("repetitions", &Benchmark::Repetitions, + py::return_value_policy::reference) + .def("report_aggregates_only", &Benchmark::ReportAggregatesOnly, + py::return_value_policy::reference, py::arg("value") = true) + .def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly, + py::return_value_policy::reference, py::arg("value") = true) + .def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime, + py::return_value_policy::reference) + .def("use_real_time", &Benchmark::UseRealTime, + py::return_value_policy::reference) + .def("use_manual_time", &Benchmark::UseManualTime, + py::return_value_policy::reference) + .def( + "complexity", + (Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity, + py::return_value_policy::reference, + py::arg("complexity") = benchmark::oAuto); + + using benchmark::Counter; + py::class_ py_counter(m, "Counter"); + + py::enum_(py_counter, "Flags") + .value("kDefaults", Counter::Flags::kDefaults) + .value("kIsRate", Counter::Flags::kIsRate) + .value("kAvgThreads", Counter::Flags::kAvgThreads) + .value("kAvgThreadsRate", Counter::Flags::kAvgThreadsRate) + .value("kIsIterationInvariant", Counter::Flags::kIsIterationInvariant) + .value("kIsIterationInvariantRate", + Counter::Flags::kIsIterationInvariantRate) + .value("kAvgIterations", Counter::Flags::kAvgIterations) + .value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate) + .value("kInvert", Counter::Flags::kInvert) + .export_values() + .def(py::self | py::self); + + py::enum_(py_counter, "OneK") + .value("kIs1000", Counter::OneK::kIs1000) + .value("kIs1024", Counter::OneK::kIs1024) + .export_values(); + + py_counter + .def(py::init(), + py::arg("value") = 0., py::arg("flags") = Counter::kDefaults, + py::arg("k") = Counter::kIs1000) + .def(py::init([](double value) { return Counter(value); })) + .def_readwrite("value", &Counter::value) + .def_readwrite("flags", &Counter::flags) + .def_readwrite("oneK", &Counter::oneK); + py::implicitly_convertible(); + py::implicitly_convertible(); + + py::bind_map(m, "UserCounters"); + + using benchmark::State; + py::class_(m, "State") + .def("__bool__", &State::KeepRunning) + .def_property_readonly("keep_running", &State::KeepRunning) + .def("pause_timing", &State::PauseTiming) + .def("resume_timing", &State::ResumeTiming) + .def("skip_with_error", &State::SkipWithError) + .def_property_readonly("error_occurred", &State::error_occurred) + .def("set_iteration_time", &State::SetIterationTime) + .def_property("bytes_processed", &State::bytes_processed, + &State::SetBytesProcessed) + .def_property("complexity_n", &State::complexity_length_n, + &State::SetComplexityN) + .def_property("items_processed", &State::items_processed, + &State::SetItemsProcessed) + .def("set_label", (void (State::*)(const char*)) & State::SetLabel) + .def("range", &State::range, py::arg("pos") = 0) + .def_property_readonly("iterations", &State::iterations) + .def_readwrite("counters", &State::counters) + .def_property_readonly("thread_index", &State::thread_index) + .def_property_readonly("threads", &State::threads); + + m.def("Initialize", Initialize); + m.def("RegisterBenchmark", RegisterBenchmark, + py::return_value_policy::reference); + m.def("RunSpecifiedBenchmarks", + []() { benchmark::RunSpecifiedBenchmarks(); }); + m.def("ClearRegisteredBenchmarks", benchmark::ClearRegisteredBenchmarks); +}; +} // namespace diff -Nru beautifulsoup4-4.10.0/bindings/python/google_benchmark/BUILD beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/BUILD --- beautifulsoup4-4.10.0/bindings/python/google_benchmark/BUILD 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/BUILD 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,38 @@ +load("//bindings/python:build_defs.bzl", "py_extension") + +py_library( + name = "google_benchmark", + srcs = ["__init__.py"], + visibility = ["//visibility:public"], + deps = [ + ":_benchmark", + # pip; absl:app + ], +) + +py_extension( + name = "_benchmark", + srcs = ["benchmark.cc"], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + ], + features = ["-use_header_modules"], + deps = [ + "//:benchmark", + "@pybind11", + "@python_headers", + ], +) + +py_test( + name = "example", + srcs = ["example.py"], + python_version = "PY3", + srcs_version = "PY3", + visibility = ["//visibility:public"], + deps = [ + ":google_benchmark", + ], +) + diff -Nru beautifulsoup4-4.10.0/bindings/python/google_benchmark/example.py beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/example.py --- beautifulsoup4-4.10.0/bindings/python/google_benchmark/example.py 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/example.py 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,136 @@ +# Copyright 2020 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Example of Python using C++ benchmark framework. + +To run this example, you must first install the `google_benchmark` Python package. + +To install using `setup.py`, download and extract the `google_benchmark` source. +In the extracted directory, execute: + python setup.py install +""" + +import random +import time + +import google_benchmark as benchmark +from google_benchmark import Counter + + +@benchmark.register +def empty(state): + while state: + pass + + +@benchmark.register +def sum_million(state): + while state: + sum(range(1_000_000)) + +@benchmark.register +def pause_timing(state): + """Pause timing every iteration.""" + while state: + # Construct a list of random ints every iteration without timing it + state.pause_timing() + random_list = [random.randint(0, 100) for _ in range(100)] + state.resume_timing() + # Time the in place sorting algorithm + random_list.sort() + + +@benchmark.register +def skipped(state): + if True: # Test some predicate here. + state.skip_with_error("some error") + return # NOTE: You must explicitly return, or benchmark will continue. + + ... # Benchmark code would be here. + + +@benchmark.register +def manual_timing(state): + while state: + # Manually count Python CPU time + start = time.perf_counter() # perf_counter_ns() in Python 3.7+ + # Something to benchmark + time.sleep(0.01) + end = time.perf_counter() + state.set_iteration_time(end - start) + + +@benchmark.register +def custom_counters(state): + """Collect cutom metric using benchmark.Counter.""" + num_foo = 0.0 + while state: + # Benchmark some code here + pass + # Collect some custom metric named foo + num_foo += 0.13 + + # Automatic Counter from numbers. + state.counters["foo"] = num_foo + # Set a counter as a rate. + state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate) + # Set a counter as an inverse of rate. + state.counters["foo_inv_rate"] = Counter(num_foo, Counter.kIsRate | Counter.kInvert) + # Set a counter as a thread-average quantity. + state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads) + # There's also a combined flag: + state.counters["foo_avg_rate"] = Counter(num_foo, Counter.kAvgThreadsRate) + + +@benchmark.register +@benchmark.option.measure_process_cpu_time() +@benchmark.option.use_real_time() +def with_options(state): + while state: + sum(range(1_000_000)) + + +@benchmark.register(name="sum_million_microseconds") +@benchmark.option.unit(benchmark.kMicrosecond) +def with_options2(state): + while state: + sum(range(1_000_000)) + + +@benchmark.register +@benchmark.option.arg(100) +@benchmark.option.arg(1000) +def passing_argument(state): + while state: + sum(range(state.range(0))) + + +@benchmark.register +@benchmark.option.range(8, limit=8 << 10) +def using_range(state): + while state: + sum(range(state.range(0))) + + +@benchmark.register +@benchmark.option.range_multiplier(2) +@benchmark.option.range(1 << 10, 1 << 18) +@benchmark.option.complexity(benchmark.oN) +def computing_complexity(state): + while state: + sum(range(state.range(0))) + state.complexity_n = state.range(0) + + +if __name__ == "__main__": + benchmark.main() diff -Nru beautifulsoup4-4.10.0/bindings/python/google_benchmark/__init__.py beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/__init__.py --- beautifulsoup4-4.10.0/bindings/python/google_benchmark/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/__init__.py 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,162 @@ +# Copyright 2020 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Python benchmarking utilities. + +Example usage: + import google_benchmark as benchmark + + @benchmark.register + def my_benchmark(state): + ... # Code executed outside `while` loop is not timed. + + while state: + ... # Code executed within `while` loop is timed. + + if __name__ == '__main__': + benchmark.main() +""" +import atexit + +from absl import app +from google_benchmark import _benchmark +from google_benchmark._benchmark import ( + Counter, + kNanosecond, + kMicrosecond, + kMillisecond, + kSecond, + oNone, + o1, + oN, + oNSquared, + oNCubed, + oLogN, + oNLogN, + oAuto, + oLambda, + State, +) + + +__all__ = [ + "register", + "main", + "Counter", + "kNanosecond", + "kMicrosecond", + "kMillisecond", + "kSecond", + "oNone", + "o1", + "oN", + "oNSquared", + "oNCubed", + "oLogN", + "oNLogN", + "oAuto", + "oLambda", + "State", +] + +__version__ = "1.7.1" + + +class __OptionMaker: + """A stateless class to collect benchmark options. + + Collect all decorator calls like @option.range(start=0, limit=1<<5). + """ + + class Options: + """Pure data class to store options calls, along with the benchmarked function.""" + + def __init__(self, func): + self.func = func + self.builder_calls = [] + + @classmethod + def make(cls, func_or_options): + """Make Options from Options or the benchmarked function.""" + if isinstance(func_or_options, cls.Options): + return func_or_options + return cls.Options(func_or_options) + + def __getattr__(self, builder_name): + """Append option call in the Options.""" + + # The function that get returned on @option.range(start=0, limit=1<<5). + def __builder_method(*args, **kwargs): + + # The decorator that get called, either with the benchmared function + # or the previous Options + def __decorator(func_or_options): + options = self.make(func_or_options) + options.builder_calls.append((builder_name, args, kwargs)) + # The decorator returns Options so it is not technically a decorator + # and needs a final call to @regiser + return options + + return __decorator + + return __builder_method + + +# Alias for nicer API. +# We have to instantiate an object, even if stateless, to be able to use __getattr__ +# on option.range +option = __OptionMaker() + + +def register(undefined=None, *, name=None): + """Register function for benchmarking.""" + if undefined is None: + # Decorator is called without parenthesis so we return a decorator + return lambda f: register(f, name=name) + + # We have either the function to benchmark (simple case) or an instance of Options + # (@option._ case). + options = __OptionMaker.make(undefined) + + if name is None: + name = options.func.__name__ + + # We register the benchmark and reproduce all the @option._ calls onto the + # benchmark builder pattern + benchmark = _benchmark.RegisterBenchmark(name, options.func) + for name, args, kwargs in options.builder_calls[::-1]: + getattr(benchmark, name)(*args, **kwargs) + + # return the benchmarked function because the decorator does not modify it + return options.func + + +def _flags_parser(argv): + argv = _benchmark.Initialize(argv) + return app.parse_flags_with_usage(argv) + + +def _run_benchmarks(argv): + if len(argv) > 1: + raise app.UsageError("Too many command-line arguments.") + return _benchmark.RunSpecifiedBenchmarks() + + +def main(argv=None): + return app.run(_run_benchmarks, argv=argv, flags_parser=_flags_parser) + + +# Methods for use with custom main function. +initialize = _benchmark.Initialize +run_benchmarks = _benchmark.RunSpecifiedBenchmarks +atexit.register(_benchmark.ClearRegisteredBenchmarks) diff -Nru beautifulsoup4-4.10.0/bindings/python/pybind11.BUILD beautifulsoup4-1.7.1-benchmark/bindings/python/pybind11.BUILD --- beautifulsoup4-4.10.0/bindings/python/pybind11.BUILD 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/pybind11.BUILD 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,20 @@ +cc_library( + name = "pybind11", + hdrs = glob( + include = [ + "include/pybind11/*.h", + "include/pybind11/detail/*.h", + ], + exclude = [ + "include/pybind11/common.h", + "include/pybind11/eigen.h", + ], + ), + copts = [ + "-fexceptions", + "-Wno-undefined-inline", + "-Wno-pragma-once-outside-header", + ], + includes = ["include"], + visibility = ["//visibility:public"], +) diff -Nru beautifulsoup4-4.10.0/bindings/python/python_headers.BUILD beautifulsoup4-1.7.1-benchmark/bindings/python/python_headers.BUILD --- beautifulsoup4-4.10.0/bindings/python/python_headers.BUILD 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/python_headers.BUILD 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,6 @@ +cc_library( + name = "python_headers", + hdrs = glob(["**/*.h"]), + includes = ["."], + visibility = ["//visibility:public"], +) diff -Nru beautifulsoup4-4.10.0/bindings/python/requirements.txt beautifulsoup4-1.7.1-benchmark/bindings/python/requirements.txt --- beautifulsoup4-4.10.0/bindings/python/requirements.txt 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/requirements.txt 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,2 @@ +absl-py>=0.7.1 + diff -Nru beautifulsoup4-4.10.0/bs4/builder/_html5lib.py beautifulsoup4-1.7.1-benchmark/bs4/builder/_html5lib.py --- beautifulsoup4-4.10.0/bs4/builder/_html5lib.py 2021-09-07 23:36:46.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bs4/builder/_html5lib.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,467 +0,0 @@ -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -__all__ = [ - 'HTML5TreeBuilder', - ] - -import warnings -import re -from bs4.builder import ( - PERMISSIVE, - HTML, - HTML_5, - HTMLTreeBuilder, - ) -from bs4.element import ( - NamespacedAttribute, - nonwhitespace_re, -) -import html5lib -from html5lib.constants import ( - namespaces, - prefixes, - ) -from bs4.element import ( - Comment, - Doctype, - NavigableString, - Tag, - ) - -try: - # Pre-0.99999999 - from html5lib.treebuilders import _base as treebuilder_base - new_html5lib = False -except ImportError as e: - # 0.99999999 and up - from html5lib.treebuilders import base as treebuilder_base - new_html5lib = True - -class HTML5TreeBuilder(HTMLTreeBuilder): - """Use html5lib to build a tree. - - Note that this TreeBuilder does not support some features common - to HTML TreeBuilders. Some of these features could theoretically - be implemented, but at the very least it's quite difficult, - because html5lib moves the parse tree around as it's being built. - - * This TreeBuilder doesn't use different subclasses of NavigableString - based on the name of the tag in which the string was found. - - * You can't use a SoupStrainer to parse only part of a document. - """ - - NAME = "html5lib" - - features = [NAME, PERMISSIVE, HTML_5, HTML] - - # html5lib can tell us which line number and position in the - # original file is the source of an element. - TRACKS_LINE_NUMBERS = True - - def prepare_markup(self, markup, user_specified_encoding, - document_declared_encoding=None, exclude_encodings=None): - # Store the user-specified encoding for use later on. - self.user_specified_encoding = user_specified_encoding - - # document_declared_encoding and exclude_encodings aren't used - # ATM because the html5lib TreeBuilder doesn't use - # UnicodeDammit. - if exclude_encodings: - warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") - yield (markup, None, None, False) - - # These methods are defined by Beautiful Soup. - def feed(self, markup): - if self.soup.parse_only is not None: - warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") - parser = html5lib.HTMLParser(tree=self.create_treebuilder) - self.underlying_builder.parser = parser - extra_kwargs = dict() - if not isinstance(markup, str): - if new_html5lib: - extra_kwargs['override_encoding'] = self.user_specified_encoding - else: - extra_kwargs['encoding'] = self.user_specified_encoding - doc = parser.parse(markup, **extra_kwargs) - - # Set the character encoding detected by the tokenizer. - if isinstance(markup, str): - # We need to special-case this because html5lib sets - # charEncoding to UTF-8 if it gets Unicode input. - doc.original_encoding = None - else: - original_encoding = parser.tokenizer.stream.charEncoding[0] - if not isinstance(original_encoding, str): - # In 0.99999999 and up, the encoding is an html5lib - # Encoding object. We want to use a string for compatibility - # with other tree builders. - original_encoding = original_encoding.name - doc.original_encoding = original_encoding - self.underlying_builder.parser = None - - def create_treebuilder(self, namespaceHTMLElements): - self.underlying_builder = TreeBuilderForHtml5lib( - namespaceHTMLElements, self.soup, - store_line_numbers=self.store_line_numbers - ) - return self.underlying_builder - - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return '%s' % fragment - - -class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): - - def __init__(self, namespaceHTMLElements, soup=None, - store_line_numbers=True, **kwargs): - if soup: - self.soup = soup - else: - from bs4 import BeautifulSoup - # TODO: Why is the parser 'html.parser' here? To avoid an - # infinite loop? - self.soup = BeautifulSoup( - "", "html.parser", store_line_numbers=store_line_numbers, - **kwargs - ) - # TODO: What are **kwargs exactly? Should they be passed in - # here in addition to/instead of being passed to the BeautifulSoup - # constructor? - super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) - - # This will be set later to an html5lib.html5parser.HTMLParser - # object, which we can use to track the current line number. - self.parser = None - self.store_line_numbers = store_line_numbers - - def documentClass(self): - self.soup.reset() - return Element(self.soup, self.soup, None) - - def insertDoctype(self, token): - name = token["name"] - publicId = token["publicId"] - systemId = token["systemId"] - - doctype = Doctype.for_name_and_ids(name, publicId, systemId) - self.soup.object_was_parsed(doctype) - - def elementClass(self, name, namespace): - kwargs = {} - if self.parser and self.store_line_numbers: - # This represents the point immediately after the end of the - # tag. We don't know when the tag started, but we do know - # where it ended -- the character just before this one. - sourceline, sourcepos = self.parser.tokenizer.stream.position() - kwargs['sourceline'] = sourceline - kwargs['sourcepos'] = sourcepos-1 - tag = self.soup.new_tag(name, namespace, **kwargs) - - return Element(tag, self.soup, namespace) - - def commentClass(self, data): - return TextNode(Comment(data), self.soup) - - def fragmentClass(self): - from bs4 import BeautifulSoup - # TODO: Why is the parser 'html.parser' here? To avoid an - # infinite loop? - self.soup = BeautifulSoup("", "html.parser") - self.soup.name = "[document_fragment]" - return Element(self.soup, self.soup, None) - - def appendChild(self, node): - # XXX This code is not covered by the BS4 tests. - self.soup.append(node.element) - - def getDocument(self): - return self.soup - - def getFragment(self): - return treebuilder_base.TreeBuilder.getFragment(self).element - - def testSerializer(self, element): - from bs4 import BeautifulSoup - rv = [] - doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') - - def serializeElement(element, indent=0): - if isinstance(element, BeautifulSoup): - pass - if isinstance(element, Doctype): - m = doctype_re.match(element) - if m: - name = m.group(1) - if m.lastindex > 1: - publicId = m.group(2) or "" - systemId = m.group(3) or m.group(4) or "" - rv.append("""|%s""" % - (' ' * indent, name, publicId, systemId)) - else: - rv.append("|%s" % (' ' * indent, name)) - else: - rv.append("|%s" % (' ' * indent,)) - elif isinstance(element, Comment): - rv.append("|%s" % (' ' * indent, element)) - elif isinstance(element, NavigableString): - rv.append("|%s\"%s\"" % (' ' * indent, element)) - else: - if element.namespace: - name = "%s %s" % (prefixes[element.namespace], - element.name) - else: - name = element.name - rv.append("|%s<%s>" % (' ' * indent, name)) - if element.attrs: - attributes = [] - for name, value in list(element.attrs.items()): - if isinstance(name, NamespacedAttribute): - name = "%s %s" % (prefixes[name.namespace], name.name) - if isinstance(value, list): - value = " ".join(value) - attributes.append((name, value)) - - for name, value in sorted(attributes): - rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) - indent += 2 - for child in element.children: - serializeElement(child, indent) - serializeElement(element, 0) - - return "\n".join(rv) - -class AttrList(object): - def __init__(self, element): - self.element = element - self.attrs = dict(self.element.attrs) - def __iter__(self): - return list(self.attrs.items()).__iter__() - def __setitem__(self, name, value): - # If this attribute is a multi-valued attribute for this element, - # turn its value into a list. - list_attr = self.element.cdata_list_attributes - if (name in list_attr['*'] - or (self.element.name in list_attr - and name in list_attr[self.element.name])): - # A node that is being cloned may have already undergone - # this procedure. - if not isinstance(value, list): - value = nonwhitespace_re.findall(value) - self.element[name] = value - def items(self): - return list(self.attrs.items()) - def keys(self): - return list(self.attrs.keys()) - def __len__(self): - return len(self.attrs) - def __getitem__(self, name): - return self.attrs[name] - def __contains__(self, name): - return name in list(self.attrs.keys()) - - -class Element(treebuilder_base.Node): - def __init__(self, element, soup, namespace): - treebuilder_base.Node.__init__(self, element.name) - self.element = element - self.soup = soup - self.namespace = namespace - - def appendChild(self, node): - string_child = child = None - if isinstance(node, str): - # Some other piece of code decided to pass in a string - # instead of creating a TextElement object to contain the - # string. - string_child = child = node - elif isinstance(node, Tag): - # Some other piece of code decided to pass in a Tag - # instead of creating an Element object to contain the - # Tag. - child = node - elif node.element.__class__ == NavigableString: - string_child = child = node.element - node.parent = self - else: - child = node.element - node.parent = self - - if not isinstance(child, str) and child.parent is not None: - node.element.extract() - - if (string_child is not None and self.element.contents - and self.element.contents[-1].__class__ == NavigableString): - # We are appending a string onto another string. - # TODO This has O(n^2) performance, for input like - # "aaa..." - old_element = self.element.contents[-1] - new_element = self.soup.new_string(old_element + string_child) - old_element.replace_with(new_element) - self.soup._most_recent_element = new_element - else: - if isinstance(node, str): - # Create a brand new NavigableString from this string. - child = self.soup.new_string(node) - - # Tell Beautiful Soup to act as if it parsed this element - # immediately after the parent's last descendant. (Or - # immediately after the parent, if it has no children.) - if self.element.contents: - most_recent_element = self.element._last_descendant(False) - elif self.element.next_element is not None: - # Something from further ahead in the parse tree is - # being inserted into this earlier element. This is - # very annoying because it means an expensive search - # for the last element in the tree. - most_recent_element = self.soup._last_descendant() - else: - most_recent_element = self.element - - self.soup.object_was_parsed( - child, parent=self.element, - most_recent_element=most_recent_element) - - def getAttributes(self): - if isinstance(self.element, Comment): - return {} - return AttrList(self.element) - - def setAttributes(self, attributes): - if attributes is not None and len(attributes) > 0: - converted_attributes = [] - for name, value in list(attributes.items()): - if isinstance(name, tuple): - new_name = NamespacedAttribute(*name) - del attributes[name] - attributes[new_name] = value - - self.soup.builder._replace_cdata_list_attribute_values( - self.name, attributes) - for name, value in list(attributes.items()): - self.element[name] = value - - # The attributes may contain variables that need substitution. - # Call set_up_substitutions manually. - # - # The Tag constructor called this method when the Tag was created, - # but we just set/changed the attributes, so call it again. - self.soup.builder.set_up_substitutions(self.element) - attributes = property(getAttributes, setAttributes) - - def insertText(self, data, insertBefore=None): - text = TextNode(self.soup.new_string(data), self.soup) - if insertBefore: - self.insertBefore(text, insertBefore) - else: - self.appendChild(text) - - def insertBefore(self, node, refNode): - index = self.element.index(refNode.element) - if (node.element.__class__ == NavigableString and self.element.contents - and self.element.contents[index-1].__class__ == NavigableString): - # (See comments in appendChild) - old_node = self.element.contents[index-1] - new_str = self.soup.new_string(old_node + node.element) - old_node.replace_with(new_str) - else: - self.element.insert(index, node.element) - node.parent = self - - def removeChild(self, node): - node.element.extract() - - def reparentChildren(self, new_parent): - """Move all of this tag's children into another tag.""" - # print("MOVE", self.element.contents) - # print("FROM", self.element) - # print("TO", new_parent.element) - - element = self.element - new_parent_element = new_parent.element - # Determine what this tag's next_element will be once all the children - # are removed. - final_next_element = element.next_sibling - - new_parents_last_descendant = new_parent_element._last_descendant(False, False) - if len(new_parent_element.contents) > 0: - # The new parent already contains children. We will be - # appending this tag's children to the end. - new_parents_last_child = new_parent_element.contents[-1] - new_parents_last_descendant_next_element = new_parents_last_descendant.next_element - else: - # The new parent contains no children. - new_parents_last_child = None - new_parents_last_descendant_next_element = new_parent_element.next_element - - to_append = element.contents - if len(to_append) > 0: - # Set the first child's previous_element and previous_sibling - # to elements within the new parent - first_child = to_append[0] - if new_parents_last_descendant is not None: - first_child.previous_element = new_parents_last_descendant - else: - first_child.previous_element = new_parent_element - first_child.previous_sibling = new_parents_last_child - if new_parents_last_descendant is not None: - new_parents_last_descendant.next_element = first_child - else: - new_parent_element.next_element = first_child - if new_parents_last_child is not None: - new_parents_last_child.next_sibling = first_child - - # Find the very last element being moved. It is now the - # parent's last descendant. It has no .next_sibling and - # its .next_element is whatever the previous last - # descendant had. - last_childs_last_descendant = to_append[-1]._last_descendant(False, True) - - last_childs_last_descendant.next_element = new_parents_last_descendant_next_element - if new_parents_last_descendant_next_element is not None: - # TODO: This code has no test coverage and I'm not sure - # how to get html5lib to go through this path, but it's - # just the other side of the previous line. - new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant - last_childs_last_descendant.next_sibling = None - - for child in to_append: - child.parent = new_parent_element - new_parent_element.contents.append(child) - - # Now that this element has no children, change its .next_element. - element.contents = [] - element.next_element = final_next_element - - # print("DONE WITH MOVE") - # print("FROM", self.element) - # print("TO", new_parent_element) - - def cloneNode(self): - tag = self.soup.new_tag(self.element.name, self.namespace) - node = Element(tag, self.soup, self.namespace) - for key,value in self.attributes: - node.attributes[key] = value - return node - - def hasContent(self): - return self.element.contents - - def getNameTuple(self): - if self.namespace == None: - return namespaces["html"], self.name - else: - return self.namespace, self.name - - nameTuple = property(getNameTuple) - -class TextNode(Element): - def __init__(self, element, soup): - treebuilder_base.Node.__init__(self, None) - self.element = element - self.soup = soup - - def cloneNode(self): - raise NotImplementedError diff -Nru beautifulsoup4-4.10.0/bs4/builder/_htmlparser.py beautifulsoup4-1.7.1-benchmark/bs4/builder/_htmlparser.py --- beautifulsoup4-4.10.0/bs4/builder/_htmlparser.py 2021-09-07 23:36:46.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bs4/builder/_htmlparser.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,492 +0,0 @@ -# encoding: utf-8 -"""Use the HTMLParser library to parse HTML files that aren't too bad.""" - -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -__all__ = [ - 'HTMLParserTreeBuilder', - ] - -from html.parser import HTMLParser - -try: - from html.parser import HTMLParseError -except ImportError as e: - # HTMLParseError is removed in Python 3.5. Since it can never be - # thrown in 3.5, we can just define our own class as a placeholder. - class HTMLParseError(Exception): - pass - -import sys -import warnings - -# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' -# argument, which we'd like to set to False. Unfortunately, -# http://bugs.python.org/issue13273 makes strict=True a better bet -# before Python 3.2.3. -# -# At the end of this file, we monkeypatch HTMLParser so that -# strict=True works well on Python 3.2.2. -major, minor, release = sys.version_info[:3] -CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 -CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 -CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 - - -from bs4.element import ( - CData, - Comment, - Declaration, - Doctype, - ProcessingInstruction, - ) -from bs4.dammit import EntitySubstitution, UnicodeDammit - -from bs4.builder import ( - HTML, - HTMLTreeBuilder, - STRICT, - ) - - -HTMLPARSER = 'html.parser' - -class BeautifulSoupHTMLParser(HTMLParser): - """A subclass of the Python standard library's HTMLParser class, which - listens for HTMLParser events and translates them into calls - to Beautiful Soup's tree construction API. - """ - - # Strategies for handling duplicate attributes - IGNORE = 'ignore' - REPLACE = 'replace' - - def __init__(self, *args, **kwargs): - """Constructor. - - :param on_duplicate_attribute: A strategy for what to do if a - tag includes the same attribute more than once. Accepted - values are: REPLACE (replace earlier values with later - ones, the default), IGNORE (keep the earliest value - encountered), or a callable. A callable must take three - arguments: the dictionary of attributes already processed, - the name of the duplicate attribute, and the most recent value - encountered. - """ - self.on_duplicate_attribute = kwargs.pop( - 'on_duplicate_attribute', self.REPLACE - ) - HTMLParser.__init__(self, *args, **kwargs) - - # Keep a list of empty-element tags that were encountered - # without an explicit closing tag. If we encounter a closing tag - # of this type, we'll associate it with one of those entries. - # - # This isn't a stack because we don't care about the - # order. It's a list of closing tags we've already handled and - # will ignore, assuming they ever show up. - self.already_closed_empty_element = [] - - def error(self, msg): - """In Python 3, HTMLParser subclasses must implement error(), although - this requirement doesn't appear to be documented. - - In Python 2, HTMLParser implements error() by raising an exception, - which we don't want to do. - - In any event, this method is called only on very strange - markup and our best strategy is to pretend it didn't happen - and keep going. - """ - warnings.warn(msg) - - def handle_startendtag(self, name, attrs): - """Handle an incoming empty-element tag. - - This is only called when the markup looks like . - - :param name: Name of the tag. - :param attrs: Dictionary of the tag's attributes. - """ - # is_startend() tells handle_starttag not to close the tag - # just because its name matches a known empty-element tag. We - # know that this is an empty-element tag and we want to call - # handle_endtag ourselves. - tag = self.handle_starttag(name, attrs, handle_empty_element=False) - self.handle_endtag(name) - - def handle_starttag(self, name, attrs, handle_empty_element=True): - """Handle an opening tag, e.g. '' - - :param name: Name of the tag. - :param attrs: Dictionary of the tag's attributes. - :param handle_empty_element: True if this tag is known to be - an empty-element tag (i.e. there is not expected to be any - closing tag). - """ - # XXX namespace - attr_dict = {} - for key, value in attrs: - # Change None attribute values to the empty string - # for consistency with the other tree builders. - if value is None: - value = '' - if key in attr_dict: - # A single attribute shows up multiple times in this - # tag. How to handle it depends on the - # on_duplicate_attribute setting. - on_dupe = self.on_duplicate_attribute - if on_dupe == self.IGNORE: - pass - elif on_dupe in (None, self.REPLACE): - attr_dict[key] = value - else: - on_dupe(attr_dict, key, value) - else: - attr_dict[key] = value - attrvalue = '""' - #print("START", name) - sourceline, sourcepos = self.getpos() - tag = self.soup.handle_starttag( - name, None, None, attr_dict, sourceline=sourceline, - sourcepos=sourcepos - ) - if tag and tag.is_empty_element and handle_empty_element: - # Unlike other parsers, html.parser doesn't send separate end tag - # events for empty-element tags. (It's handled in - # handle_startendtag, but only if the original markup looked like - # .) - # - # So we need to call handle_endtag() ourselves. Since we - # know the start event is identical to the end event, we - # don't want handle_endtag() to cross off any previous end - # events for tags of this name. - self.handle_endtag(name, check_already_closed=False) - - # But we might encounter an explicit closing tag for this tag - # later on. If so, we want to ignore it. - self.already_closed_empty_element.append(name) - - def handle_endtag(self, name, check_already_closed=True): - """Handle a closing tag, e.g. '' - - :param name: A tag name. - :param check_already_closed: True if this tag is expected to - be the closing portion of an empty-element tag, - e.g. ''. - """ - #print("END", name) - if check_already_closed and name in self.already_closed_empty_element: - # This is a redundant end tag for an empty-element tag. - # We've already called handle_endtag() for it, so just - # check it off the list. - #print("ALREADY CLOSED", name) - self.already_closed_empty_element.remove(name) - else: - self.soup.handle_endtag(name) - - def handle_data(self, data): - """Handle some textual data that shows up between tags.""" - self.soup.handle_data(data) - - def handle_charref(self, name): - """Handle a numeric character reference by converting it to the - corresponding Unicode character and treating it as textual - data. - - :param name: Character number, possibly in hexadecimal. - """ - # XXX workaround for a bug in HTMLParser. Remove this once - # it's fixed in all supported versions. - # http://bugs.python.org/issue13633 - if name.startswith('x'): - real_name = int(name.lstrip('x'), 16) - elif name.startswith('X'): - real_name = int(name.lstrip('X'), 16) - else: - real_name = int(name) - - data = None - if real_name < 256: - # HTML numeric entities are supposed to reference Unicode - # code points, but sometimes they reference code points in - # some other encoding (ahem, Windows-1252). E.g. “ - # instead of É for LEFT DOUBLE QUOTATION MARK. This - # code tries to detect this situation and compensate. - for encoding in (self.soup.original_encoding, 'windows-1252'): - if not encoding: - continue - try: - data = bytearray([real_name]).decode(encoding) - except UnicodeDecodeError as e: - pass - if not data: - try: - data = chr(real_name) - except (ValueError, OverflowError) as e: - pass - data = data or "\N{REPLACEMENT CHARACTER}" - self.handle_data(data) - - def handle_entityref(self, name): - """Handle a named entity reference by converting it to the - corresponding Unicode character(s) and treating it as textual - data. - - :param name: Name of the entity reference. - """ - character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) - if character is not None: - data = character - else: - # If this were XML, it would be ambiguous whether "&foo" - # was an character entity reference with a missing - # semicolon or the literal string "&foo". Since this is - # HTML, we have a complete list of all character entity references, - # and this one wasn't found, so assume it's the literal string "&foo". - data = "&%s" % name - self.handle_data(data) - - def handle_comment(self, data): - """Handle an HTML comment. - - :param data: The text of the comment. - """ - self.soup.endData() - self.soup.handle_data(data) - self.soup.endData(Comment) - - def handle_decl(self, data): - """Handle a DOCTYPE declaration. - - :param data: The text of the declaration. - """ - self.soup.endData() - data = data[len("DOCTYPE "):] - self.soup.handle_data(data) - self.soup.endData(Doctype) - - def unknown_decl(self, data): - """Handle a declaration of unknown type -- probably a CDATA block. - - :param data: The text of the declaration. - """ - if data.upper().startswith('CDATA['): - cls = CData - data = data[len('CDATA['):] - else: - cls = Declaration - self.soup.endData() - self.soup.handle_data(data) - self.soup.endData(cls) - - def handle_pi(self, data): - """Handle a processing instruction. - - :param data: The text of the instruction. - """ - self.soup.endData() - self.soup.handle_data(data) - self.soup.endData(ProcessingInstruction) - - -class HTMLParserTreeBuilder(HTMLTreeBuilder): - """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser, - found in the Python standard library. - """ - is_xml = False - picklable = True - NAME = HTMLPARSER - features = [NAME, HTML, STRICT] - - # The html.parser knows which line number and position in the - # original file is the source of an element. - TRACKS_LINE_NUMBERS = True - - def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): - """Constructor. - - :param parser_args: Positional arguments to pass into - the BeautifulSoupHTMLParser constructor, once it's - invoked. - :param parser_kwargs: Keyword arguments to pass into - the BeautifulSoupHTMLParser constructor, once it's - invoked. - :param kwargs: Keyword arguments for the superclass constructor. - """ - # Some keyword arguments will be pulled out of kwargs and placed - # into parser_kwargs. - extra_parser_kwargs = dict() - for arg in ('on_duplicate_attribute',): - if arg in kwargs: - value = kwargs.pop(arg) - extra_parser_kwargs[arg] = value - super(HTMLParserTreeBuilder, self).__init__(**kwargs) - parser_args = parser_args or [] - parser_kwargs = parser_kwargs or {} - parser_kwargs.update(extra_parser_kwargs) - if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: - parser_kwargs['strict'] = False - if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: - parser_kwargs['convert_charrefs'] = False - self.parser_args = (parser_args, parser_kwargs) - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None, exclude_encodings=None): - - """Run any preliminary steps necessary to make incoming markup - acceptable to the parser. - - :param markup: Some markup -- probably a bytestring. - :param user_specified_encoding: The user asked to try this encoding. - :param document_declared_encoding: The markup itself claims to be - in this encoding. - :param exclude_encodings: The user asked _not_ to try any of - these encodings. - - :yield: A series of 4-tuples: - (markup, encoding, declared encoding, - has undergone character replacement) - - Each 4-tuple represents a strategy for converting the - document to Unicode and parsing it. Each strategy will be tried - in turn. - """ - if isinstance(markup, str): - # Parse Unicode as-is. - yield (markup, None, None, False) - return - - # Ask UnicodeDammit to sniff the most likely encoding. - - # This was provided by the end-user; treat it as a known - # definite encoding per the algorithm laid out in the HTML5 - # spec. (See the EncodingDetector class for details.) - known_definite_encodings = [user_specified_encoding] - - # This was found in the document; treat it as a slightly lower-priority - # user encoding. - user_encodings = [document_declared_encoding] - - try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit( - markup, - known_definite_encodings=known_definite_encodings, - user_encodings=user_encodings, - is_html=True, - exclude_encodings=exclude_encodings - ) - yield (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) - - def feed(self, markup): - """Run some incoming markup through some parsing process, - populating the `BeautifulSoup` object in self.soup. - """ - args, kwargs = self.parser_args - parser = BeautifulSoupHTMLParser(*args, **kwargs) - parser.soup = self.soup - try: - parser.feed(markup) - parser.close() - except HTMLParseError as e: - warnings.warn(RuntimeWarning( - "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) - raise e - parser.already_closed_empty_element = [] - -# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some -# 3.2.3 code. This ensures they don't treat markup like

as a -# string. -# -# XXX This code can be removed once most Python 3 users are on 3.2.3. -if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: - import re - attrfind_tolerant = re.compile( - r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') - HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant - - locatestarttagend = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value - ) - )? - ) - )* - \s* # trailing whitespace -""", re.VERBOSE) - BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend - - from html.parser import tagfind, attrfind - - def parse_starttag(self, i): - self.__starttag_text = None - endpos = self.check_for_whole_start_tag(i) - if endpos < 0: - return endpos - rawdata = self.rawdata - self.__starttag_text = rawdata[i:endpos] - - # Now parse the data between i+1 and j into a tag and attrs - attrs = [] - match = tagfind.match(rawdata, i+1) - assert match, 'unexpected call to parse_starttag()' - k = match.end() - self.lasttag = tag = rawdata[i+1:k].lower() - while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) - if not m: - break - attrname, rest, attrvalue = m.group(1, 2, 3) - if not rest: - attrvalue = None - elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ - attrvalue[:1] == '"' == attrvalue[-1:]: - attrvalue = attrvalue[1:-1] - if attrvalue: - attrvalue = self.unescape(attrvalue) - attrs.append((attrname.lower(), attrvalue)) - k = m.end() - - end = rawdata[k:endpos].strip() - if end not in (">", "/>"): - lineno, offset = self.getpos() - if "\n" in self.__starttag_text: - lineno = lineno + self.__starttag_text.count("\n") - offset = len(self.__starttag_text) \ - - self.__starttag_text.rfind("\n") - else: - offset = offset + len(self.__starttag_text) - if self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) - self.handle_data(rawdata[i:endpos]) - return endpos - if end.endswith('/>'): - # XHTML-style empty tag: - self.handle_startendtag(tag, attrs) - else: - self.handle_starttag(tag, attrs) - if tag in self.CDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag) - return endpos - - def set_cdata_mode(self, elem): - self.cdata_elem = elem.lower() - self.interesting = re.compile(r'' % self.cdata_elem, re.I) - - BeautifulSoupHTMLParser.parse_starttag = parse_starttag - BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode - - CONSTRUCTOR_TAKES_STRICT = True diff -Nru beautifulsoup4-4.10.0/bs4/builder/__init__.py beautifulsoup4-1.7.1-benchmark/bs4/builder/__init__.py --- beautifulsoup4-4.10.0/bs4/builder/__init__.py 2021-09-07 23:36:46.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bs4/builder/__init__.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,520 +0,0 @@ -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -from collections import defaultdict -import itertools -import sys -from bs4.element import ( - CharsetMetaAttributeValue, - ContentMetaAttributeValue, - Stylesheet, - Script, - TemplateString, - nonwhitespace_re -) - -__all__ = [ - 'HTMLTreeBuilder', - 'SAXTreeBuilder', - 'TreeBuilder', - 'TreeBuilderRegistry', - ] - -# Some useful features for a TreeBuilder to have. -FAST = 'fast' -PERMISSIVE = 'permissive' -STRICT = 'strict' -XML = 'xml' -HTML = 'html' -HTML_5 = 'html5' - - -class TreeBuilderRegistry(object): - """A way of looking up TreeBuilder subclasses by their name or by desired - features. - """ - - def __init__(self): - self.builders_for_feature = defaultdict(list) - self.builders = [] - - def register(self, treebuilder_class): - """Register a treebuilder based on its advertised features. - - :param treebuilder_class: A subclass of Treebuilder. its .features - attribute should list its features. - """ - for feature in treebuilder_class.features: - self.builders_for_feature[feature].insert(0, treebuilder_class) - self.builders.insert(0, treebuilder_class) - - def lookup(self, *features): - """Look up a TreeBuilder subclass with the desired features. - - :param features: A list of features to look for. If none are - provided, the most recently registered TreeBuilder subclass - will be used. - :return: A TreeBuilder subclass, or None if there's no - registered subclass with all the requested features. - """ - if len(self.builders) == 0: - # There are no builders at all. - return None - - if len(features) == 0: - # They didn't ask for any features. Give them the most - # recently registered builder. - return self.builders[0] - - # Go down the list of features in order, and eliminate any builders - # that don't match every feature. - features = list(features) - features.reverse() - candidates = None - candidate_set = None - while len(features) > 0: - feature = features.pop() - we_have_the_feature = self.builders_for_feature.get(feature, []) - if len(we_have_the_feature) > 0: - if candidates is None: - candidates = we_have_the_feature - candidate_set = set(candidates) - else: - # Eliminate any candidates that don't have this feature. - candidate_set = candidate_set.intersection( - set(we_have_the_feature)) - - # The only valid candidates are the ones in candidate_set. - # Go through the original list of candidates and pick the first one - # that's in candidate_set. - if candidate_set is None: - return None - for candidate in candidates: - if candidate in candidate_set: - return candidate - return None - -# The BeautifulSoup class will take feature lists from developers and use them -# to look up builders in this registry. -builder_registry = TreeBuilderRegistry() - -class TreeBuilder(object): - """Turn a textual document into a Beautiful Soup object tree.""" - - NAME = "[Unknown tree builder]" - ALTERNATE_NAMES = [] - features = [] - - is_xml = False - picklable = False - empty_element_tags = None # A tag will be considered an empty-element - # tag when and only when it has no contents. - - # A value for these tag/attribute combinations is a space- or - # comma-separated list of CDATA, rather than a single CDATA. - DEFAULT_CDATA_LIST_ATTRIBUTES = {} - - # Whitespace should be preserved inside these tags. - DEFAULT_PRESERVE_WHITESPACE_TAGS = set() - - # The textual contents of tags with these names should be - # instantiated with some class other than NavigableString. - DEFAULT_STRING_CONTAINERS = {} - - USE_DEFAULT = object() - - # Most parsers don't keep track of line numbers. - TRACKS_LINE_NUMBERS = False - - def __init__(self, multi_valued_attributes=USE_DEFAULT, - preserve_whitespace_tags=USE_DEFAULT, - store_line_numbers=USE_DEFAULT, - string_containers=USE_DEFAULT, - ): - """Constructor. - - :param multi_valued_attributes: If this is set to None, the - TreeBuilder will not turn any values for attributes like - 'class' into lists. Setting this to a dictionary will - customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES - for an example. - - Internally, these are called "CDATA list attributes", but that - probably doesn't make sense to an end-user, so the argument name - is `multi_valued_attributes`. - - :param preserve_whitespace_tags: A list of tags to treat - the way
 tags are treated in HTML. Tags in this list
-         are immune from pretty-printing; their contents will always be
-         output as-is.
-
-        :param string_containers: A dictionary mapping tag names to
-        the classes that should be instantiated to contain the textual
-        contents of those tags. The default is to use NavigableString
-        for every tag, no matter what the name. You can override the
-        default by changing DEFAULT_STRING_CONTAINERS.
-
-        :param store_line_numbers: If the parser keeps track of the
-         line numbers and positions of the original markup, that
-         information will, by default, be stored in each corresponding
-         `Tag` object. You can turn this off by passing
-         store_line_numbers=False. If the parser you're using doesn't 
-         keep track of this information, then setting store_line_numbers=True
-         will do nothing.
-        """
-        self.soup = None
-        if multi_valued_attributes is self.USE_DEFAULT:
-            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
-        self.cdata_list_attributes = multi_valued_attributes
-        if preserve_whitespace_tags is self.USE_DEFAULT:
-            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
-        self.preserve_whitespace_tags = preserve_whitespace_tags
-        if store_line_numbers == self.USE_DEFAULT:
-            store_line_numbers = self.TRACKS_LINE_NUMBERS
-        self.store_line_numbers = store_line_numbers 
-        if string_containers == self.USE_DEFAULT:
-            string_containers = self.DEFAULT_STRING_CONTAINERS
-        self.string_containers = string_containers
-        
-    def initialize_soup(self, soup):
-        """The BeautifulSoup object has been initialized and is now
-        being associated with the TreeBuilder.
-
-        :param soup: A BeautifulSoup object.
-        """
-        self.soup = soup
-        
-    def reset(self):
-        """Do any work necessary to reset the underlying parser
-        for a new document.
-
-        By default, this does nothing.
-        """
-        pass
-
-    def can_be_empty_element(self, tag_name):
-        """Might a tag with this name be an empty-element tag?
-
-        The final markup may or may not actually present this tag as
-        self-closing.
-
-        For instance: an HTMLBuilder does not consider a 

tag to be - an empty-element tag (it's not in - HTMLBuilder.empty_element_tags). This means an empty

tag - will be presented as "

", not "

" or "

". - - The default implementation has no opinion about which tags are - empty-element tags, so a tag will be presented as an - empty-element tag if and only if it has no children. - "" will become "", and "bar" will - be left alone. - - :param tag_name: The name of a markup tag. - """ - if self.empty_element_tags is None: - return True - return tag_name in self.empty_element_tags - - def feed(self, markup): - """Run some incoming markup through some parsing process, - populating the `BeautifulSoup` object in self.soup. - - This method is not implemented in TreeBuilder; it must be - implemented in subclasses. - - :return: None. - """ - raise NotImplementedError() - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None, exclude_encodings=None): - """Run any preliminary steps necessary to make incoming markup - acceptable to the parser. - - :param markup: Some markup -- probably a bytestring. - :param user_specified_encoding: The user asked to try this encoding. - :param document_declared_encoding: The markup itself claims to be - in this encoding. NOTE: This argument is not used by the - calling code and can probably be removed. - :param exclude_encodings: The user asked _not_ to try any of - these encodings. - - :yield: A series of 4-tuples: - (markup, encoding, declared encoding, - has undergone character replacement) - - Each 4-tuple represents a strategy for converting the - document to Unicode and parsing it. Each strategy will be tried - in turn. - - By default, the only strategy is to parse the markup - as-is. See `LXMLTreeBuilderForXML` and - `HTMLParserTreeBuilder` for implementations that take into - account the quirks of particular parsers. - """ - yield markup, None, None, False - - def test_fragment_to_document(self, fragment): - """Wrap an HTML fragment to make it look like a document. - - Different parsers do this differently. For instance, lxml - introduces an empty tag, and html5lib - doesn't. Abstracting this away lets us write simple tests - which run HTML fragments through the parser and compare the - results against other HTML fragments. - - This method should not be used outside of tests. - - :param fragment: A string -- fragment of HTML. - :return: A string -- a full HTML document. - """ - return fragment - - def set_up_substitutions(self, tag): - """Set up any substitutions that will need to be performed on - a `Tag` when it's output as a string. - - By default, this does nothing. See `HTMLTreeBuilder` for a - case where this is used. - - :param tag: A `Tag` - :return: Whether or not a substitution was performed. - """ - return False - - def _replace_cdata_list_attribute_values(self, tag_name, attrs): - """When an attribute value is associated with a tag that can - have multiple values for that attribute, convert the string - value to a list of strings. - - Basically, replaces class="foo bar" with class=["foo", "bar"] - - NOTE: This method modifies its input in place. - - :param tag_name: The name of a tag. - :param attrs: A dictionary containing the tag's attributes. - Any appropriate attribute values will be modified in place. - """ - if not attrs: - return attrs - if self.cdata_list_attributes: - universal = self.cdata_list_attributes.get('*', []) - tag_specific = self.cdata_list_attributes.get( - tag_name.lower(), None) - for attr in list(attrs.keys()): - if attr in universal or (tag_specific and attr in tag_specific): - # We have a "class"-type attribute whose string - # value is a whitespace-separated list of - # values. Split it into a list. - value = attrs[attr] - if isinstance(value, str): - values = nonwhitespace_re.findall(value) - else: - # html5lib sometimes calls setAttributes twice - # for the same tag when rearranging the parse - # tree. On the second call the attribute value - # here is already a list. If this happens, - # leave the value alone rather than trying to - # split it again. - values = value - attrs[attr] = values - return attrs - -class SAXTreeBuilder(TreeBuilder): - """A Beautiful Soup treebuilder that listens for SAX events. - - This is not currently used for anything, but it demonstrates - how a simple TreeBuilder would work. - """ - - def feed(self, markup): - raise NotImplementedError() - - def close(self): - pass - - def startElement(self, name, attrs): - attrs = dict((key[1], value) for key, value in list(attrs.items())) - #print("Start %s, %r" % (name, attrs)) - self.soup.handle_starttag(name, attrs) - - def endElement(self, name): - #print("End %s" % name) - self.soup.handle_endtag(name) - - def startElementNS(self, nsTuple, nodeName, attrs): - # Throw away (ns, nodeName) for now. - self.startElement(nodeName, attrs) - - def endElementNS(self, nsTuple, nodeName): - # Throw away (ns, nodeName) for now. - self.endElement(nodeName) - #handler.endElementNS((ns, node.nodeName), node.nodeName) - - def startPrefixMapping(self, prefix, nodeValue): - # Ignore the prefix for now. - pass - - def endPrefixMapping(self, prefix): - # Ignore the prefix for now. - # handler.endPrefixMapping(prefix) - pass - - def characters(self, content): - self.soup.handle_data(content) - - def startDocument(self): - pass - - def endDocument(self): - pass - - -class HTMLTreeBuilder(TreeBuilder): - """This TreeBuilder knows facts about HTML. - - Such as which tags are empty-element tags. - """ - - empty_element_tags = set([ - # These are from HTML5. - 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', - - # These are from earlier versions of HTML and are removed in HTML5. - 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' - ]) - - # The HTML standard defines these as block-level elements. Beautiful - # Soup does not treat these elements differently from other elements, - # but it may do so eventually, and this information is available if - # you need to use it. - block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) - - # The HTML standard defines an unusual content model for these tags. - # We represent this by using a string class other than NavigableString - # inside these tags. - # - # I made this list by going through the HTML spec - # (https://html.spec.whatwg.org/#metadata-content) and looking for - # "metadata content" elements that can contain strings. - # - # TODO: Arguably

 in HTML
-        documents) should not.
-        """
-        return (
-            indent_level is not None
-            and (
-                not self.preserve_whitespace_tags
-                or self.name not in self.preserve_whitespace_tags
-            )
-        )
-
-    def prettify(self, encoding=None, formatter="minimal"):
-        """Pretty-print this PageElement as a string.
-
-        :param encoding: The eventual encoding of the string. If this is None,
-            a Unicode string will be returned.
-        :param formatter: A Formatter object, or a string naming one of
-            the standard formatters.
-        :return: A Unicode string (if encoding==None) or a bytestring 
-            (otherwise).
-        """
-        if encoding is None:
-            return self.decode(True, formatter=formatter)
-        else:
-            return self.encode(encoding, True, formatter=formatter)
-
-    def decode_contents(self, indent_level=None,
-                       eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-                       formatter="minimal"):
-        """Renders the contents of this tag as a Unicode string.
-
-        :param indent_level: Each line of the rendering will be
-           indented this many spaces. Used internally in
-           recursive calls while pretty-printing.
-
-        :param eventual_encoding: The tag is destined to be
-           encoded into this encoding. decode_contents() is _not_
-           responsible for performing that encoding. This information
-           is passed in so that it can be substituted in if the
-           document contains a  tag that mentions the document's
-           encoding.
-
-        :param formatter: A Formatter object, or a string naming one of
-            the standard Formatters.
-        """
-        # First off, turn a string formatter into a Formatter object. This
-        # will stop the lookup from happening over and over again.
-        if not isinstance(formatter, Formatter):
-            formatter = self.formatter_for_name(formatter)
-
-        pretty_print = (indent_level is not None)
-        s = []
-        for c in self:
-            text = None
-            if isinstance(c, NavigableString):
-                text = c.output_ready(formatter)
-            elif isinstance(c, Tag):
-                s.append(c.decode(indent_level, eventual_encoding,
-                                  formatter))
-            preserve_whitespace = (
-                self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
-            )
-            if text and indent_level and not preserve_whitespace:
-                text = text.strip()
-            if text:
-                if pretty_print and not preserve_whitespace:
-                    s.append(" " * (indent_level - 1))
-                s.append(text)
-                if pretty_print and not preserve_whitespace:
-                    s.append("\n")
-        return ''.join(s)
-       
-    def encode_contents(
-        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
-        formatter="minimal"):
-        """Renders the contents of this PageElement as a bytestring.
-
-        :param indent_level: Each line of the rendering will be
-           indented this many spaces. Used internally in
-           recursive calls while pretty-printing.
-
-        :param eventual_encoding: The bytestring will be in this encoding.
-
-        :param formatter: A Formatter object, or a string naming one of
-            the standard Formatters.
-
-        :return: A bytestring.
-        """
-        contents = self.decode_contents(indent_level, encoding, formatter)
-        return contents.encode(encoding)
-
-    # Old method for BS3 compatibility
-    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
-                       prettyPrint=False, indentLevel=0):
-        """Deprecated method for BS3 compatibility."""
-        if not prettyPrint:
-            indentLevel = None
-        return self.encode_contents(
-            indent_level=indentLevel, encoding=encoding)
-
-    #Soup methods
-
-    def find(self, name=None, attrs={}, recursive=True, text=None,
-             **kwargs):
-        """Look in the children of this PageElement and find the first
-        PageElement that matches the given criteria.
-
-        All find_* methods take a common set of arguments. See the online
-        documentation for detailed explanations.
-
-        :param name: A filter on tag name.
-        :param attrs: A dictionary of filters on attribute values.
-        :param recursive: If this is True, find() will perform a
-            recursive search of this PageElement's children. Otherwise,
-            only the direct children will be considered.
-        :param limit: Stop looking after finding this many results.
-        :kwargs: A dictionary of filters on attribute values.
-        :return: A PageElement.
-        :rtype: bs4.element.Tag | bs4.element.NavigableString
-        """
-        r = None
-        l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
-        if l:
-            r = l[0]
-        return r
-    findChild = find #BS2
-
-    def find_all(self, name=None, attrs={}, recursive=True, text=None,
-                 limit=None, **kwargs):
-        """Look in the children of this PageElement and find all
-        PageElements that match the given criteria.
-
-        All find_* methods take a common set of arguments. See the online
-        documentation for detailed explanations.
-
-        :param name: A filter on tag name.
-        :param attrs: A dictionary of filters on attribute values.
-        :param recursive: If this is True, find_all() will perform a
-            recursive search of this PageElement's children. Otherwise,
-            only the direct children will be considered.
-        :param limit: Stop looking after finding this many results.
-        :kwargs: A dictionary of filters on attribute values.
-        :return: A ResultSet of PageElements.
-        :rtype: bs4.element.ResultSet
-        """
-        generator = self.descendants
-        if not recursive:
-            generator = self.children
-        return self._find_all(name, attrs, text, limit, generator, **kwargs)
-    findAll = find_all       # BS3
-    findChildren = find_all  # BS2
-
-    #Generator methods
-    @property
-    def children(self):
-        """Iterate over all direct children of this PageElement.
-
-        :yield: A sequence of PageElements.
-        """
-        # return iter() to make the purpose of the method clear
-        return iter(self.contents)  # XXX This seems to be untested.
-
-    @property
-    def descendants(self):
-        """Iterate over all children of this PageElement in a
-        breadth-first sequence.
-
-        :yield: A sequence of PageElements.
-        """
-        if not len(self.contents):
-            return
-        stopNode = self._last_descendant().next_element
-        current = self.contents[0]
-        while current is not stopNode:
-            yield current
-            current = current.next_element
-
-    # CSS selector code
-    def select_one(self, selector, namespaces=None, **kwargs):
-        """Perform a CSS selection operation on the current element.
-
-        :param selector: A CSS selector.
-
-        :param namespaces: A dictionary mapping namespace prefixes
-           used in the CSS selector to namespace URIs. By default,
-           Beautiful Soup will use the prefixes it encountered while
-           parsing the document.
-
-        :param kwargs: Keyword arguments to be passed into SoupSieve's 
-           soupsieve.select() method.
-
-        :return: A Tag.
-        :rtype: bs4.element.Tag
-        """
-        value = self.select(selector, namespaces, 1, **kwargs)
-        if value:
-            return value[0]
-        return None
-
-    def select(self, selector, namespaces=None, limit=None, **kwargs):
-        """Perform a CSS selection operation on the current element.
-
-        This uses the SoupSieve library.
-
-        :param selector: A string containing a CSS selector.
-
-        :param namespaces: A dictionary mapping namespace prefixes
-           used in the CSS selector to namespace URIs. By default,
-           Beautiful Soup will use the prefixes it encountered while
-           parsing the document.
-
-        :param limit: After finding this number of results, stop looking.
-
-        :param kwargs: Keyword arguments to be passed into SoupSieve's 
-           soupsieve.select() method.
-
-        :return: A ResultSet of Tags.
-        :rtype: bs4.element.ResultSet
-        """
-        if namespaces is None:
-            namespaces = self._namespaces
-        
-        if limit is None:
-            limit = 0
-        if soupsieve is None:
-            raise NotImplementedError(
-                "Cannot execute CSS selectors because the soupsieve package is not installed."
-            )
-            
-        results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
-
-        # We do this because it's more consistent and because
-        # ResultSet.__getattr__ has a helpful error message.
-        return ResultSet(None, results)
-
-    # Old names for backwards compatibility
-    def childGenerator(self):
-        """Deprecated generator."""
-        return self.children
-
-    def recursiveChildGenerator(self):
-        """Deprecated generator."""
-        return self.descendants
-
-    def has_key(self, key):
-        """Deprecated method. This was kind of misleading because has_key()
-        (attributes) was different from __in__ (contents).
-
-        has_key() is gone in Python 3, anyway.
-        """
-        warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
-                key))
-        return self.has_attr(key)
-
-# Next, a couple classes to represent queries and their results.
-class SoupStrainer(object):
-    """Encapsulates a number of ways of matching a markup element (tag or
-    string).
-
-    This is primarily used to underpin the find_* methods, but you can
-    create one yourself and pass it in as `parse_only` to the
-    `BeautifulSoup` constructor, to parse a subset of a large
-    document.
-    """
-
-    def __init__(self, name=None, attrs={}, text=None, **kwargs):
-        """Constructor.
-
-        The SoupStrainer constructor takes the same arguments passed
-        into the find_* methods. See the online documentation for
-        detailed explanations.
-
-        :param name: A filter on tag name.
-        :param attrs: A dictionary of filters on attribute values.
-        :param text: A filter for a NavigableString with specific text.
-        :kwargs: A dictionary of filters on attribute values.
-        """        
-        self.name = self._normalize_search_value(name)
-        if not isinstance(attrs, dict):
-            # Treat a non-dict value for attrs as a search for the 'class'
-            # attribute.
-            kwargs['class'] = attrs
-            attrs = None
-
-        if 'class_' in kwargs:
-            # Treat class_="foo" as a search for the 'class'
-            # attribute, overriding any non-dict value for attrs.
-            kwargs['class'] = kwargs['class_']
-            del kwargs['class_']
-
-        if kwargs:
-            if attrs:
-                attrs = attrs.copy()
-                attrs.update(kwargs)
-            else:
-                attrs = kwargs
-        normalized_attrs = {}
-        for key, value in list(attrs.items()):
-            normalized_attrs[key] = self._normalize_search_value(value)
-
-        self.attrs = normalized_attrs
-        self.text = self._normalize_search_value(text)
-
-    def _normalize_search_value(self, value):
-        # Leave it alone if it's a Unicode string, a callable, a
-        # regular expression, a boolean, or None.
-        if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
-            or isinstance(value, bool) or value is None):
-            return value
-
-        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
-        if isinstance(value, bytes):
-            return value.decode("utf8")
-
-        # If it's listlike, convert it into a list of strings.
-        if hasattr(value, '__iter__'):
-            new_value = []
-            for v in value:
-                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
-                    and not isinstance(v, str)):
-                    # This is almost certainly the user's mistake. In the
-                    # interests of avoiding infinite loops, we'll let
-                    # it through as-is rather than doing a recursive call.
-                    new_value.append(v)
-                else:
-                    new_value.append(self._normalize_search_value(v))
-            return new_value
-
-        # Otherwise, convert it into a Unicode string.
-        # The unicode(str()) thing is so this will do the same thing on Python 2
-        # and Python 3.
-        return str(str(value))
-
-    def __str__(self):
-        """A human-readable representation of this SoupStrainer."""
-        if self.text:
-            return self.text
-        else:
-            return "%s|%s" % (self.name, self.attrs)
-
-    def search_tag(self, markup_name=None, markup_attrs={}):
-        """Check whether a Tag with the given name and attributes would
-        match this SoupStrainer.
-
-        Used prospectively to decide whether to even bother creating a Tag
-        object.
-
-        :param markup_name: A tag name as found in some markup.
-        :param markup_attrs: A dictionary of attributes as found in some markup.
-
-        :return: True if the prospective tag would match this SoupStrainer;
-            False otherwise.
-        """
-        found = None
-        markup = None
-        if isinstance(markup_name, Tag):
-            markup = markup_name
-            markup_attrs = markup
-
-        if isinstance(self.name, str):
-            # Optimization for a very common case where the user is
-            # searching for a tag with one specific name, and we're
-            # looking at a tag with a different name.
-            if markup and not markup.prefix and self.name != markup.name:
-                 return False
-            
-        call_function_with_tag_data = (
-            isinstance(self.name, Callable)
-            and not isinstance(markup_name, Tag))
-
-        if ((not self.name)
-            or call_function_with_tag_data
-            or (markup and self._matches(markup, self.name))
-            or (not markup and self._matches(markup_name, self.name))):
-            if call_function_with_tag_data:
-                match = self.name(markup_name, markup_attrs)
-            else:
-                match = True
-                markup_attr_map = None
-                for attr, match_against in list(self.attrs.items()):
-                    if not markup_attr_map:
-                        if hasattr(markup_attrs, 'get'):
-                            markup_attr_map = markup_attrs
-                        else:
-                            markup_attr_map = {}
-                            for k, v in markup_attrs:
-                                markup_attr_map[k] = v
-                    attr_value = markup_attr_map.get(attr)
-                    if not self._matches(attr_value, match_against):
-                        match = False
-                        break
-            if match:
-                if markup:
-                    found = markup
-                else:
-                    found = markup_name
-        if found and self.text and not self._matches(found.string, self.text):
-            found = None
-        return found
-
-    # For BS3 compatibility.
-    searchTag = search_tag
-
-    def search(self, markup):
-        """Find all items in `markup` that match this SoupStrainer.
-
-        Used by the core _find_all() method, which is ultimately
-        called by all find_* methods.
-
-        :param markup: A PageElement or a list of them.
-        """
-        # print('looking for %s in %s' % (self, markup))
-        found = None
-        # If given a list of items, scan it for a text element that
-        # matches.
-        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
-            for element in markup:
-                if isinstance(element, NavigableString) \
-                       and self.search(element):
-                    found = element
-                    break
-        # If it's a Tag, make sure its name or attributes match.
-        # Don't bother with Tags if we're searching for text.
-        elif isinstance(markup, Tag):
-            if not self.text or self.name or self.attrs:
-                found = self.search_tag(markup)
-        # If it's text, make sure the text matches.
-        elif isinstance(markup, NavigableString) or \
-                 isinstance(markup, str):
-            if not self.name and not self.attrs and self._matches(markup, self.text):
-                found = markup
-        else:
-            raise Exception(
-                "I don't know how to match against a %s" % markup.__class__)
-        return found
-
-    def _matches(self, markup, match_against, already_tried=None):
-        # print(u"Matching %s against %s" % (markup, match_against))
-        result = False
-        if isinstance(markup, list) or isinstance(markup, tuple):
-            # This should only happen when searching a multi-valued attribute
-            # like 'class'.
-            for item in markup:
-                if self._matches(item, match_against):
-                    return True
-            # We didn't match any particular value of the multivalue
-            # attribute, but maybe we match the attribute value when
-            # considered as a string.
-            if self._matches(' '.join(markup), match_against):
-                return True
-            return False
-        
-        if match_against is True:
-            # True matches any non-None value.
-            return markup is not None
-
-        if isinstance(match_against, Callable):
-            return match_against(markup)
-
-        # Custom callables take the tag as an argument, but all
-        # other ways of matching match the tag name as a string.
-        original_markup = markup
-        if isinstance(markup, Tag):
-            markup = markup.name
-
-        # Ensure that `markup` is either a Unicode string, or None.
-        markup = self._normalize_search_value(markup)
-
-        if markup is None:
-            # None matches None, False, an empty string, an empty list, and so on.
-            return not match_against
-
-        if (hasattr(match_against, '__iter__')
-            and not isinstance(match_against, str)):
-            # We're asked to match against an iterable of items.
-            # The markup must be match at least one item in the
-            # iterable. We'll try each one in turn.
-            #
-            # To avoid infinite recursion we need to keep track of
-            # items we've already seen.
-            if not already_tried:
-                already_tried = set()
-            for item in match_against:
-                if item.__hash__:
-                    key = item
-                else:
-                    key = id(item)
-                if key in already_tried:
-                    continue
-                else:
-                    already_tried.add(key)
-                    if self._matches(original_markup, item, already_tried):
-                        return True
-            else:
-                return False
-        
-        # Beyond this point we might need to run the test twice: once against
-        # the tag's name and once against its prefixed name.
-        match = False
-        
-        if not match and isinstance(match_against, str):
-            # Exact string match
-            match = markup == match_against
-
-        if not match and hasattr(match_against, 'search'):
-            # Regexp match
-            return match_against.search(markup)
-
-        if (not match
-            and isinstance(original_markup, Tag)
-            and original_markup.prefix):
-            # Try the whole thing again with the prefixed tag name.
-            return self._matches(
-                original_markup.prefix + ':' + original_markup.name, match_against
-            )
-
-        return match
-
-
-class ResultSet(list):
-    """A ResultSet is just a list that keeps track of the SoupStrainer
-    that created it."""
-    def __init__(self, source, result=()):
-        """Constructor.
-
-        :param source: A SoupStrainer.
-        :param result: A list of PageElements.
-        """
-        super(ResultSet, self).__init__(result)
-        self.source = source
-
-    def __getattr__(self, key):
-        """Raise a helpful exception to explain a common code fix."""
-        raise AttributeError(
-            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
-        )
diff -Nru beautifulsoup4-4.10.0/bs4/formatter.py beautifulsoup4-1.7.1-benchmark/bs4/formatter.py
--- beautifulsoup4-4.10.0/bs4/formatter.py	2021-09-07 23:36:46.000000000 +0000
+++ beautifulsoup4-1.7.1-benchmark/bs4/formatter.py	1970-01-01 00:00:00.000000000 +0000
@@ -1,165 +0,0 @@
-from bs4.dammit import EntitySubstitution
-
-class Formatter(EntitySubstitution):
-    """Describes a strategy to use when outputting a parse tree to a string.
-
-    Some parts of this strategy come from the distinction between
-    HTML4, HTML5, and XML. Others are configurable by the user.
-
-    Formatters are passed in as the `formatter` argument to methods
-    like `PageElement.encode`. Most people won't need to think about
-    formatters, and most people who need to think about them can pass
-    in one of these predefined strings as `formatter` rather than
-    making a new Formatter object:
-
-    For HTML documents:
-     * 'html' - HTML entity substitution for generic HTML documents. (default)
-     * 'html5' - HTML entity substitution for HTML5 documents, as
-                 well as some optimizations in the way tags are rendered.
-     * 'minimal' - Only make the substitutions necessary to guarantee
-                   valid HTML.
-     * None - Do not perform any substitution. This will be faster
-              but may result in invalid markup.
-
-    For XML documents:
-     * 'html' - Entity substitution for XHTML documents.
-     * 'minimal' - Only make the substitutions necessary to guarantee
-                   valid XML. (default)
-     * None - Do not perform any substitution. This will be faster
-              but may result in invalid markup.
-    """
-    # Registries of XML and HTML formatters.
-    XML_FORMATTERS = {}
-    HTML_FORMATTERS = {}
-
-    HTML = 'html'
-    XML = 'xml'
-
-    HTML_DEFAULTS = dict(
-        cdata_containing_tags=set(["script", "style"]),
-    )
-
-    def _default(self, language, value, kwarg):
-        if value is not None:
-            return value
-        if language == self.XML:
-            return set()
-        return self.HTML_DEFAULTS[kwarg]
-
-    def __init__(
-            self, language=None, entity_substitution=None,
-            void_element_close_prefix='/', cdata_containing_tags=None,
-            empty_attributes_are_booleans=False,
-    ):
-        """Constructor.
-
-        :param language: This should be Formatter.XML if you are formatting
-           XML markup and Formatter.HTML if you are formatting HTML markup.
-
-        :param entity_substitution: A function to call to replace special
-           characters with XML/HTML entities. For examples, see 
-           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
-        :param void_element_close_prefix: By default, void elements
-           are represented as  (XML rules) rather than 
-           (HTML rules). To get , pass in the empty string.
-        :param cdata_containing_tags: The list of tags that are defined
-           as containing CDATA in this dialect. For example, in HTML,
-           
-
This numeric entity is missing the final semicolon:
- -
a
-
This document contains (do you see it?)
-
This document ends with That attribute value was bogus
-The doctype is invalid because it contains extra whitespace -
That boolean attribute had no value
-
Here's a nonexistent entity: &#foo; (do you see it?)
-
This document ends before the entity finishes: > -

Paragraphs shouldn't contain block display elements, but this one does:

you see?

-Multiple values for the same attribute. -
Here's a table
-
-
This tag contains nothing but whitespace:
-

This p tag is cut off by

the end of the blockquote tag
-
Here's a nested table:
foo
This table contains bare markup
- -
This document contains a surprise doctype
- -
Tag name contains Unicode characters
- - -""" - - -class SoupTest(unittest.TestCase): - - @property - def default_builder(self): - return default_builder - - def soup(self, markup, **kwargs): - """Build a Beautiful Soup object from markup.""" - builder = kwargs.pop('builder', self.default_builder) - return BeautifulSoup(markup, builder=builder, **kwargs) - - def document_for(self, markup, **kwargs): - """Turn an HTML fragment into a document. - - The details depend on the builder. - """ - return self.default_builder(**kwargs).test_fragment_to_document(markup) - - def assertSoupEquals(self, to_parse, compare_parsed_to=None): - builder = self.default_builder - obj = BeautifulSoup(to_parse, builder=builder) - if compare_parsed_to is None: - compare_parsed_to = to_parse - - # Verify that the documents come out the same. - self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) - - # Also run some checks on the BeautifulSoup object itself: - - # Verify that every tag that was opened was eventually closed. - - # There are no tags in the open tag counter. - assert all(v==0 for v in list(obj.open_tag_counter.values())) - - # The only tag in the tag stack is the one for the root - # document. - self.assertEqual( - [obj.ROOT_TAG_NAME], [x.name for x in obj.tagStack] - ) - - def assertConnectedness(self, element): - """Ensure that next_element and previous_element are properly - set for all descendants of the given element. - """ - earlier = None - for e in element.descendants: - if earlier: - self.assertEqual(e, earlier.next_element) - self.assertEqual(earlier, e.previous_element) - earlier = e - - def linkage_validator(self, el, _recursive_call=False): - """Ensure proper linkage throughout the document.""" - descendant = None - # Document element should have no previous element or previous sibling. - # It also shouldn't have a next sibling. - if el.parent is None: - assert el.previous_element is None,\ - "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( - el, el.previous_element, None - ) - assert el.previous_sibling is None,\ - "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( - el, el.previous_sibling, None - ) - assert el.next_sibling is None,\ - "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( - el, el.next_sibling, None - ) - - idx = 0 - child = None - last_child = None - last_idx = len(el.contents) - 1 - for child in el.contents: - descendant = None - - # Parent should link next element to their first child - # That child should have no previous sibling - if idx == 0: - if el.parent is not None: - assert el.next_element is child,\ - "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( - el, el.next_element, child - ) - assert child.previous_element is el,\ - "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( - child, child.previous_element, el - ) - assert child.previous_sibling is None,\ - "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format( - child, child.previous_sibling, None - ) - - # If not the first child, previous index should link as sibling to this index - # Previous element should match the last index or the last bubbled up descendant - else: - assert child.previous_sibling is el.contents[idx - 1],\ - "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format( - child, child.previous_sibling, el.contents[idx - 1] - ) - assert el.contents[idx - 1].next_sibling is child,\ - "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( - el.contents[idx - 1], el.contents[idx - 1].next_sibling, child - ) - - if last_child is not None: - assert child.previous_element is last_child,\ - "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format( - child, child.previous_element, last_child, child.parent.contents - ) - assert last_child.next_element is child,\ - "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( - last_child, last_child.next_element, child - ) - - if isinstance(child, Tag) and child.contents: - descendant = self.linkage_validator(child, True) - # A bubbled up descendant should have no next siblings - assert descendant.next_sibling is None,\ - "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( - descendant, descendant.next_sibling, None - ) - - # Mark last child as either the bubbled up descendant or the current child - if descendant is not None: - last_child = descendant - else: - last_child = child - - # If last child, there are non next siblings - if idx == last_idx: - assert child.next_sibling is None,\ - "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( - child, child.next_sibling, None - ) - idx += 1 - - child = descendant if descendant is not None else child - if child is None: - child = el - - if not _recursive_call and child is not None: - target = el - while True: - if target is None: - assert child.next_element is None, \ - "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( - child, child.next_element, None - ) - break - elif target.next_sibling is not None: - assert child.next_element is target.next_sibling, \ - "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( - child, child.next_element, target.next_sibling - ) - break - target = target.parent - - # We are done, so nothing to return - return None - else: - # Return the child to the recursive caller - return child - - -class TreeBuilderSmokeTest(object): - # Tests that are common to HTML and XML tree builders. - - def test_fuzzed_input(self): - # This test centralizes in one place the various fuzz tests - # for Beautiful Soup created by the oss-fuzz project. - - # These strings superficially resemble markup, but they - # generally can't be parsed into anything. The best we can - # hope for is that parsing these strings won't crash the - # parser. - # - # n.b. This markup is commented out because these fuzz tests - # _do_ crash the parser. However the crashes are due to bugs - # in html.parser, not Beautiful Soup -- otherwise I'd fix the - # bugs! - - bad_markup = [ - # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873 - # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700 - # https://bugs.python.org/issue37747 - # - #b'\nSome CSS" - ) - assert isinstance(soup.style.string, Stylesheet) - assert isinstance(soup.script.string, Script) - - soup = self.soup( - "" - ) - assert isinstance(soup.style.string, Stylesheet) - # The contents of the style tag resemble an HTML comment, but - # it's not treated as a comment. - self.assertEqual("", soup.style.string) - assert isinstance(soup.style.string, Stylesheet) - - def test_pickle_and_unpickle_identity(self): - # Pickling a tree, then unpickling it, yields a tree identical - # to the original. - tree = self.soup("foo") - dumped = pickle.dumps(tree, 2) - loaded = pickle.loads(dumped) - self.assertEqual(loaded.__class__, BeautifulSoup) - self.assertEqual(loaded.decode(), tree.decode()) - - def assertDoctypeHandled(self, doctype_fragment): - """Assert that a given doctype string is handled correctly.""" - doctype_str, soup = self._document_with_doctype(doctype_fragment) - - # Make sure a Doctype object was created. - doctype = soup.contents[0] - self.assertEqual(doctype.__class__, Doctype) - self.assertEqual(doctype, doctype_fragment) - self.assertEqual( - soup.encode("utf8")[:len(doctype_str)], - doctype_str - ) - - # Make sure that the doctype was correctly associated with the - # parse tree and that the rest of the document parsed. - self.assertEqual(soup.p.contents[0], 'foo') - - def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"): - """Generate and parse a document with the given doctype.""" - doctype = '' % (doctype_string, doctype_fragment) - markup = doctype + '\n

foo

' - soup = self.soup(markup) - return doctype.encode("utf8"), soup - - def test_normal_doctypes(self): - """Make sure normal, everyday HTML doctypes are handled correctly.""" - self.assertDoctypeHandled("html") - self.assertDoctypeHandled( - 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') - - def test_empty_doctype(self): - soup = self.soup("") - doctype = soup.contents[0] - self.assertEqual("", doctype.strip()) - - def test_mixed_case_doctype(self): - # A lowercase or mixed-case doctype becomes a Doctype. - for doctype_fragment in ("doctype", "DocType"): - doctype_str, soup = self._document_with_doctype( - "html", doctype_fragment - ) - - # Make sure a Doctype object was created and that the DOCTYPE - # is uppercase. - doctype = soup.contents[0] - self.assertEqual(doctype.__class__, Doctype) - self.assertEqual(doctype, "html") - self.assertEqual( - soup.encode("utf8")[:len(doctype_str)], - b"" - ) - - # Make sure that the doctype was correctly associated with the - # parse tree and that the rest of the document parsed. - self.assertEqual(soup.p.contents[0], 'foo') - - def test_public_doctype_with_url(self): - doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' - self.assertDoctypeHandled(doctype) - - def test_system_doctype(self): - self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') - - def test_namespaced_system_doctype(self): - # We can handle a namespaced doctype with a system ID. - self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') - - def test_namespaced_public_doctype(self): - # Test a namespaced doctype with a public id. - self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') - - def test_real_xhtml_document(self): - """A real XHTML document should come out more or less the same as it went in.""" - markup = b""" - - -Hello. -Goodbye. -""" - soup = self.soup(markup) - self.assertEqual( - soup.encode("utf-8").replace(b"\n", b""), - markup.replace(b"\n", b"")) - - def test_namespaced_html(self): - """When a namespaced XML document is parsed as HTML it should - be treated as HTML with weird tag names. - """ - markup = b"""content""" - soup = self.soup(markup) - self.assertEqual(2, len(soup.find_all("ns1:foo"))) - - def test_processing_instruction(self): - # We test both Unicode and bytestring to verify that - # process_markup correctly sets processing_instruction_class - # even when the markup is already Unicode and there is no - # need to process anything. - markup = """""" - soup = self.soup(markup) - self.assertEqual(markup, soup.decode()) - - markup = b"""""" - soup = self.soup(markup) - self.assertEqual(markup, soup.encode("utf8")) - - def test_deepcopy(self): - """Make sure you can copy the tree builder. - - This is important because the builder is part of a - BeautifulSoup object, and we want to be able to copy that. - """ - copy.deepcopy(self.default_builder) - - def test_p_tag_is_never_empty_element(self): - """A

tag is never designated as an empty-element tag. - - Even if the markup shows it as an empty-element tag, it - shouldn't be presented that way. - """ - soup = self.soup("

") - self.assertFalse(soup.p.is_empty_element) - self.assertEqual(str(soup.p), "

") - - def test_unclosed_tags_get_closed(self): - """A tag that's not closed by the end of the document should be closed. - - This applies to all tags except empty-element tags. - """ - self.assertSoupEquals("

", "

") - self.assertSoupEquals("", "") - - self.assertSoupEquals("
", "
") - - def test_br_is_always_empty_element_tag(self): - """A
tag is designated as an empty-element tag. - - Some parsers treat

as one
tag, some parsers as - two tags, but it should always be an empty-element tag. - """ - soup = self.soup("

") - self.assertTrue(soup.br.is_empty_element) - self.assertEqual(str(soup.br), "
") - - def test_nested_formatting_elements(self): - self.assertSoupEquals("") - - def test_double_head(self): - html = ''' - - -Ordinary HEAD element test - - - -Hello, world! - - -''' - soup = self.soup(html) - self.assertEqual("text/javascript", soup.find('script')['type']) - - def test_comment(self): - # Comments are represented as Comment objects. - markup = "

foobaz

" - self.assertSoupEquals(markup) - - soup = self.soup(markup) - comment = soup.find(text="foobar") - self.assertEqual(comment.__class__, Comment) - - # The comment is properly integrated into the tree. - foo = soup.find(text="foo") - self.assertEqual(comment, foo.next_element) - baz = soup.find(text="baz") - self.assertEqual(comment, baz.previous_element) - - def test_preserved_whitespace_in_pre_and_textarea(self): - """Whitespace must be preserved in
 and "
-        self.assertSoupEquals(pre_markup)
-        self.assertSoupEquals(textarea_markup)
-
-        soup = self.soup(pre_markup)
-        self.assertEqual(soup.pre.prettify(), pre_markup)
-
-        soup = self.soup(textarea_markup)
-        self.assertEqual(soup.textarea.prettify(), textarea_markup)
-
-        soup = self.soup("")
-        self.assertEqual(soup.textarea.prettify(), "")
-
-    def test_nested_inline_elements(self):
-        """Inline elements can be nested indefinitely."""
-        b_tag = "Inside a B tag"
-        self.assertSoupEquals(b_tag)
-
-        nested_b_tag = "

A nested tag

" - self.assertSoupEquals(nested_b_tag) - - double_nested_b_tag = "

A doubly nested tag

" - self.assertSoupEquals(nested_b_tag) - - def test_nested_block_level_elements(self): - """Block elements can be nested.""" - soup = self.soup('

Foo

') - blockquote = soup.blockquote - self.assertEqual(blockquote.p.b.string, 'Foo') - self.assertEqual(blockquote.b.string, 'Foo') - - def test_correctly_nested_tables(self): - """One table can go inside another one.""" - markup = ('' - '' - "') - - self.assertSoupEquals( - markup, - '
Here's another table:" - '' - '' - '
foo
Here\'s another table:' - '
foo
' - '
') - - self.assertSoupEquals( - "" - "" - "
Foo
Bar
Baz
") - - def test_multivalued_attribute_with_whitespace(self): - # Whitespace separating the values of a multi-valued attribute - # should be ignored. - - markup = '
' - soup = self.soup(markup) - self.assertEqual(['foo', 'bar'], soup.div['class']) - - # If you search by the literal name of the class it's like the whitespace - # wasn't there. - self.assertEqual(soup.div, soup.find('div', class_="foo bar")) - - def test_deeply_nested_multivalued_attribute(self): - # html5lib can set the attributes of the same tag many times - # as it rearranges the tree. This has caused problems with - # multivalued attributes. - markup = '
' - soup = self.soup(markup) - self.assertEqual(["css"], soup.div.div['class']) - - def test_multivalued_attribute_on_html(self): - # html5lib uses a different API to set the attributes ot the - # tag. This has caused problems with multivalued - # attributes. - markup = '' - soup = self.soup(markup) - self.assertEqual(["a", "b"], soup.html['class']) - - def test_angle_brackets_in_attribute_values_are_escaped(self): - self.assertSoupEquals('', '') - - def test_strings_resembling_character_entity_references(self): - # "&T" and "&p" look like incomplete character entities, but they are - # not. - self.assertSoupEquals( - "

• AT&T is in the s&p 500

", - "

\u2022 AT&T is in the s&p 500

" - ) - - def test_apos_entity(self): - self.assertSoupEquals( - "

Bob's Bar

", - "

Bob's Bar

", - ) - - def test_entities_in_foreign_document_encoding(self): - # “ and ” are invalid numeric entities referencing - # Windows-1252 characters. - references a character common - # to Windows-1252 and Unicode, and ☃ references a - # character only found in Unicode. - # - # All of these entities should be converted to Unicode - # characters. - markup = "

“Hello” -☃

" - soup = self.soup(markup) - self.assertEqual("“Hello” -☃", soup.p.string) - - def test_entities_in_attributes_converted_to_unicode(self): - expect = '

' - self.assertSoupEquals('

', expect) - self.assertSoupEquals('

', expect) - self.assertSoupEquals('

', expect) - self.assertSoupEquals('

', expect) - - def test_entities_in_text_converted_to_unicode(self): - expect = '

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' - self.assertSoupEquals("

piñata

", expect) - self.assertSoupEquals("

piñata

", expect) - self.assertSoupEquals("

piñata

", expect) - self.assertSoupEquals("

piñata

", expect) - - def test_quot_entity_converted_to_quotation_mark(self): - self.assertSoupEquals("

I said "good day!"

", - '

I said "good day!"

') - - def test_out_of_range_entity(self): - expect = "\N{REPLACEMENT CHARACTER}" - self.assertSoupEquals("�", expect) - self.assertSoupEquals("�", expect) - self.assertSoupEquals("�", expect) - - def test_multipart_strings(self): - "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." - soup = self.soup("

\nfoo

") - self.assertEqual("p", soup.h2.string.next_element.name) - self.assertEqual("p", soup.p.name) - self.assertConnectedness(soup) - - def test_empty_element_tags(self): - """Verify consistent handling of empty-element tags, - no matter how they come in through the markup. - """ - self.assertSoupEquals('


', "


") - self.assertSoupEquals('


', "


") - - def test_head_tag_between_head_and_body(self): - "Prevent recurrence of a bug in the html5lib treebuilder." - content = """ - - foo - -""" - soup = self.soup(content) - self.assertNotEqual(None, soup.html.body) - self.assertConnectedness(soup) - - def test_multiple_copies_of_a_tag(self): - "Prevent recurrence of a bug in the html5lib treebuilder." - content = """ - - - - - -""" - soup = self.soup(content) - self.assertConnectedness(soup.article) - - def test_basic_namespaces(self): - """Parsers don't need to *understand* namespaces, but at the - very least they should not choke on namespaces or lose - data.""" - - markup = b'4' - soup = self.soup(markup) - self.assertEqual(markup, soup.encode()) - html = soup.html - self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) - self.assertEqual( - 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) - self.assertEqual( - 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) - - def test_multivalued_attribute_value_becomes_list(self): - markup = b'' - soup = self.soup(markup) - self.assertEqual(['foo', 'bar'], soup.a['class']) - - # - # Generally speaking, tests below this point are more tests of - # Beautiful Soup than tests of the tree builders. But parsers are - # weird, so we run these tests separately for every tree builder - # to detect any differences between them. - # - - def test_can_parse_unicode_document(self): - # A seemingly innocuous document... but it's in Unicode! And - # it contains characters that can't be represented in the - # encoding found in the declaration! The horror! - markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' - soup = self.soup(markup) - self.assertEqual('Sacr\xe9 bleu!', soup.body.string) - - def test_soupstrainer(self): - """Parsers should be able to work with SoupStrainers.""" - strainer = SoupStrainer("b") - soup = self.soup("A bold statement", - parse_only=strainer) - self.assertEqual(soup.decode(), "bold") - - def test_single_quote_attribute_values_become_double_quotes(self): - self.assertSoupEquals("", - '') - - def test_attribute_values_with_nested_quotes_are_left_alone(self): - text = """a""" - self.assertSoupEquals(text) - - def test_attribute_values_with_double_nested_quotes_get_quoted(self): - text = """a""" - soup = self.soup(text) - soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' - self.assertSoupEquals( - soup.foo.decode(), - """a""") - - def test_ampersand_in_attribute_value_gets_escaped(self): - self.assertSoupEquals('', - '') - - self.assertSoupEquals( - 'foo', - 'foo') - - def test_escaped_ampersand_in_attribute_value_is_left_alone(self): - self.assertSoupEquals('') - - def test_entities_in_strings_converted_during_parsing(self): - # Both XML and HTML entities are converted to Unicode characters - # during parsing. - text = "

<<sacré bleu!>>

" - expected = "

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" - self.assertSoupEquals(text, expected) - - def test_smart_quotes_converted_on_the_way_in(self): - # Microsoft smart quotes are converted to Unicode characters during - # parsing. - quote = b"

\x91Foo\x92

" - soup = self.soup(quote) - self.assertEqual( - soup.p.string, - "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") - - def test_non_breaking_spaces_converted_on_the_way_in(self): - soup = self.soup("  ") - self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) - - def test_entities_converted_on_the_way_out(self): - text = "

<<sacré bleu!>>

" - expected = "

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") - soup = self.soup(text) - self.assertEqual(soup.p.encode("utf-8"), expected) - - def test_real_iso_latin_document(self): - # Smoke test of interrelated functionality, using an - # easy-to-understand document. - - # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. - unicode_html = '

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' - - # That's because we're going to encode it into ISO-Latin-1, and use - # that to test. - iso_latin_html = unicode_html.encode("iso-8859-1") - - # Parse the ISO-Latin-1 HTML. - soup = self.soup(iso_latin_html) - # Encode it to UTF-8. - result = soup.encode("utf-8") - - # What do we expect the result to look like? Well, it would - # look like unicode_html, except that the META tag would say - # UTF-8 instead of ISO-Latin-1. - expected = unicode_html.replace("ISO-Latin-1", "utf-8") - - # And, of course, it would be in UTF-8, not Unicode. - expected = expected.encode("utf-8") - - # Ta-da! - self.assertEqual(result, expected) - - def test_real_shift_jis_document(self): - # Smoke test to make sure the parser can handle a document in - # Shift-JIS encoding, without choking. - shift_jis_html = ( - b'
'
-            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
-            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
-            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
-            b'
') - unicode_html = shift_jis_html.decode("shift-jis") - soup = self.soup(unicode_html) - - # Make sure the parse tree is correctly encoded to various - # encodings. - self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) - self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) - - def test_real_hebrew_document(self): - # A real-world test to make sure we can convert ISO-8859-9 (a - # Hebrew encoding) to UTF-8. - hebrew_document = b'Hebrew (ISO 8859-8) in Visual Directionality

Hebrew (ISO 8859-8) in Visual Directionality

\xed\xe5\xec\xf9' - soup = self.soup( - hebrew_document, from_encoding="iso8859-8") - # Some tree builders call it iso8859-8, others call it iso-8859-9. - # That's not a difference we really care about. - assert soup.original_encoding in ('iso8859-8', 'iso-8859-8') - self.assertEqual( - soup.encode('utf-8'), - hebrew_document.decode("iso8859-8").encode("utf-8")) - - def test_meta_tag_reflects_current_encoding(self): - # Here's the tag saying that a document is - # encoded in Shift-JIS. - meta_tag = ('') - - # Here's a document incorporating that meta tag. - shift_jis_html = ( - '\n%s\n' - '' - 'Shift-JIS markup goes here.') % meta_tag - soup = self.soup(shift_jis_html) - - # Parse the document, and the charset is seemingly unaffected. - parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) - content = parsed_meta['content'] - self.assertEqual('text/html; charset=x-sjis', content) - - # But that value is actually a ContentMetaAttributeValue object. - self.assertTrue(isinstance(content, ContentMetaAttributeValue)) - - # And it will take on a value that reflects its current - # encoding. - self.assertEqual('text/html; charset=utf8', content.encode("utf8")) - - # For the rest of the story, see TestSubstitutions in - # test_tree.py. - - def test_html5_style_meta_tag_reflects_current_encoding(self): - # Here's the tag saying that a document is - # encoded in Shift-JIS. - meta_tag = ('') - - # Here's a document incorporating that meta tag. - shift_jis_html = ( - '\n%s\n' - '' - 'Shift-JIS markup goes here.') % meta_tag - soup = self.soup(shift_jis_html) - - # Parse the document, and the charset is seemingly unaffected. - parsed_meta = soup.find('meta', id="encoding") - charset = parsed_meta['charset'] - self.assertEqual('x-sjis', charset) - - # But that value is actually a CharsetMetaAttributeValue object. - self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) - - # And it will take on a value that reflects its current - # encoding. - self.assertEqual('utf8', charset.encode("utf8")) - - def test_python_specific_encodings_not_used_in_charset(self): - # You can encode an HTML document using a Python-specific - # encoding, but that encoding won't be mentioned _inside_ the - # resulting document. Instead, the document will appear to - # have no encoding. - for markup in [ - b'' - b'' - ]: - soup = self.soup(markup) - for encoding in PYTHON_SPECIFIC_ENCODINGS: - if encoding in ( - 'idna', 'mbcs', 'oem', 'undefined', - 'string_escape', 'string-escape' - ): - # For one reason or another, these will raise an - # exception if we actually try to use them, so don't - # bother. - continue - encoded = soup.encode(encoding) - assert b'meta charset=""' in encoded - assert encoding.encode("ascii") not in encoded - - def test_tag_with_no_attributes_can_have_attributes_added(self): - data = self.soup("text") - data.a['foo'] = 'bar' - self.assertEqual('text', data.a.decode()) - - def test_closing_tag_with_no_opening_tag(self): - # Without BeautifulSoup.open_tag_counter, the tag will - # cause _popToTag to be called over and over again as we look - # for a tag that wasn't there. The result is that 'text2' - # will show up outside the body of the document. - soup = self.soup("

text1

text2
") - self.assertEqual( - "

text1

text2
", soup.body.decode() - ) - - def test_worst_case(self): - """Test the worst case (currently) for linking issues.""" - - soup = self.soup(BAD_DOCUMENT) - self.linkage_validator(soup) - - -class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest): - - def test_pickle_and_unpickle_identity(self): - # Pickling a tree, then unpickling it, yields a tree identical - # to the original. - tree = self.soup("foo") - dumped = pickle.dumps(tree, 2) - loaded = pickle.loads(dumped) - self.assertEqual(loaded.__class__, BeautifulSoup) - self.assertEqual(loaded.decode(), tree.decode()) - - def test_docstring_generated(self): - soup = self.soup("") - self.assertEqual( - soup.encode(), b'\n') - - def test_xml_declaration(self): - markup = b"""\n""" - soup = self.soup(markup) - self.assertEqual(markup, soup.encode("utf8")) - - def test_python_specific_encodings_not_used_in_xml_declaration(self): - # You can encode an XML document using a Python-specific - # encoding, but that encoding won't be mentioned _inside_ the - # resulting document. - markup = b"""\n""" - soup = self.soup(markup) - for encoding in PYTHON_SPECIFIC_ENCODINGS: - if encoding in ( - 'idna', 'mbcs', 'oem', 'undefined', - 'string_escape', 'string-escape' - ): - # For one reason or another, these will raise an - # exception if we actually try to use them, so don't - # bother. - continue - encoded = soup.encode(encoding) - assert b'' in encoded - assert encoding.encode("ascii") not in encoded - - def test_processing_instruction(self): - markup = b"""\n""" - soup = self.soup(markup) - self.assertEqual(markup, soup.encode("utf8")) - - def test_real_xhtml_document(self): - """A real XHTML document should come out *exactly* the same as it went in.""" - markup = b""" - - -Hello. -Goodbye. -""" - soup = self.soup(markup) - self.assertEqual( - soup.encode("utf-8"), markup) - - def test_nested_namespaces(self): - doc = b""" - - - - - -""" - soup = self.soup(doc) - self.assertEqual(doc, soup.encode()) - - def test_formatter_processes_script_tag_for_xml_documents(self): - doc = """ - -""" - soup = BeautifulSoup(doc, "lxml-xml") - # lxml would have stripped this while parsing, but we can add - # it later. - soup.script.string = 'console.log("< < hey > > ");' - encoded = soup.encode() - self.assertTrue(b"< < hey > >" in encoded) - - def test_can_parse_unicode_document(self): - markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' - soup = self.soup(markup) - self.assertEqual('Sacr\xe9 bleu!', soup.root.string) - - def test_popping_namespaced_tag(self): - markup = 'b2012-07-02T20:33:42Zcd' - soup = self.soup(markup) - self.assertEqual( - str(soup.rss), markup) - - def test_docstring_includes_correct_encoding(self): - soup = self.soup("") - self.assertEqual( - soup.encode("latin1"), - b'\n') - - def test_large_xml_document(self): - """A large XML document should come out the same as it went in.""" - markup = (b'\n' - + b'0' * (2**12) - + b'') - soup = self.soup(markup) - self.assertEqual(soup.encode("utf-8"), markup) - - - def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): - self.assertSoupEquals("

", "

") - self.assertSoupEquals("

foo

") - - def test_namespaces_are_preserved(self): - markup = 'This tag is in the a namespaceThis tag is in the b namespace' - soup = self.soup(markup) - root = soup.root - self.assertEqual("http://example.com/", root['xmlns:a']) - self.assertEqual("http://example.net/", root['xmlns:b']) - - def test_closing_namespaced_tag(self): - markup = '

20010504

' - soup = self.soup(markup) - self.assertEqual(str(soup.p), markup) - - def test_namespaced_attributes(self): - markup = '' - soup = self.soup(markup) - self.assertEqual(str(soup.foo), markup) - - def test_namespaced_attributes_xml_namespace(self): - markup = 'bar' - soup = self.soup(markup) - self.assertEqual(str(soup.foo), markup) - - def test_find_by_prefixed_name(self): - doc = """ -foo - bar - baz - -""" - soup = self.soup(doc) - - # There are three tags. - self.assertEqual(3, len(soup.find_all('tag'))) - - # But two of them are ns1:tag and one of them is ns2:tag. - self.assertEqual(2, len(soup.find_all('ns1:tag'))) - self.assertEqual(1, len(soup.find_all('ns2:tag'))) - - self.assertEqual(1, len(soup.find_all('ns2:tag', key='value'))) - self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag']))) - - def test_copy_tag_preserves_namespace(self): - xml = """ -""" - - soup = self.soup(xml) - tag = soup.document - duplicate = copy.copy(tag) - - # The two tags have the same namespace prefix. - self.assertEqual(tag.prefix, duplicate.prefix) - - def test_worst_case(self): - """Test the worst case (currently) for linking issues.""" - - soup = self.soup(BAD_DOCUMENT) - self.linkage_validator(soup) - - -class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): - """Smoke test for a tree builder that supports HTML5.""" - - def test_real_xhtml_document(self): - # Since XHTML is not HTML5, HTML5 parsers are not tested to handle - # XHTML documents in any particular way. - pass - - def test_html_tags_have_namespace(self): - markup = "" - soup = self.soup(markup) - self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) - - def test_svg_tags_have_namespace(self): - markup = '' - soup = self.soup(markup) - namespace = "http://www.w3.org/2000/svg" - self.assertEqual(namespace, soup.svg.namespace) - self.assertEqual(namespace, soup.circle.namespace) - - - def test_mathml_tags_have_namespace(self): - markup = '5' - soup = self.soup(markup) - namespace = 'http://www.w3.org/1998/Math/MathML' - self.assertEqual(namespace, soup.math.namespace) - self.assertEqual(namespace, soup.msqrt.namespace) - - def test_xml_declaration_becomes_comment(self): - markup = '' - soup = self.soup(markup) - self.assertTrue(isinstance(soup.contents[0], Comment)) - self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') - self.assertEqual("html", soup.contents[0].next_element.name) - -def skipIf(condition, reason): - def nothing(test, *args, **kwargs): - return None - - def decorator(test_item): - if condition: - return nothing - else: - return test_item - - return decorator diff -Nru beautifulsoup4-4.10.0/bs4/tests/__init__.py beautifulsoup4-1.7.1-benchmark/bs4/tests/__init__.py --- beautifulsoup4-4.10.0/bs4/tests/__init__.py 2020-04-05 19:54:12.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bs4/tests/__init__.py 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -"The beautifulsoup tests." diff -Nru beautifulsoup4-4.10.0/bs4/tests/test_builder_registry.py beautifulsoup4-1.7.1-benchmark/bs4/tests/test_builder_registry.py --- beautifulsoup4-4.10.0/bs4/tests/test_builder_registry.py 2020-04-05 19:54:12.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bs4/tests/test_builder_registry.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,147 +0,0 @@ -"""Tests of the builder registry.""" - -import unittest -import warnings - -from bs4 import BeautifulSoup -from bs4.builder import ( - builder_registry as registry, - HTMLParserTreeBuilder, - TreeBuilderRegistry, -) - -try: - from bs4.builder import HTML5TreeBuilder - HTML5LIB_PRESENT = True -except ImportError: - HTML5LIB_PRESENT = False - -try: - from bs4.builder import ( - LXMLTreeBuilderForXML, - LXMLTreeBuilder, - ) - LXML_PRESENT = True -except ImportError: - LXML_PRESENT = False - - -class BuiltInRegistryTest(unittest.TestCase): - """Test the built-in registry with the default builders registered.""" - - def test_combination(self): - if LXML_PRESENT: - self.assertEqual(registry.lookup('fast', 'html'), - LXMLTreeBuilder) - - if LXML_PRESENT: - self.assertEqual(registry.lookup('permissive', 'xml'), - LXMLTreeBuilderForXML) - self.assertEqual(registry.lookup('strict', 'html'), - HTMLParserTreeBuilder) - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html5lib', 'html'), - HTML5TreeBuilder) - - def test_lookup_by_markup_type(self): - if LXML_PRESENT: - self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) - self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) - else: - self.assertEqual(registry.lookup('xml'), None) - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) - else: - self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) - - def test_named_library(self): - if LXML_PRESENT: - self.assertEqual(registry.lookup('lxml', 'xml'), - LXMLTreeBuilderForXML) - self.assertEqual(registry.lookup('lxml', 'html'), - LXMLTreeBuilder) - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html5lib'), - HTML5TreeBuilder) - - self.assertEqual(registry.lookup('html.parser'), - HTMLParserTreeBuilder) - - def test_beautifulsoup_constructor_does_lookup(self): - - with warnings.catch_warnings(record=True) as w: - # This will create a warning about not explicitly - # specifying a parser, but we'll ignore it. - - # You can pass in a string. - BeautifulSoup("", features="html") - # Or a list of strings. - BeautifulSoup("", features=["html", "fast"]) - - # You'll get an exception if BS can't find an appropriate - # builder. - self.assertRaises(ValueError, BeautifulSoup, - "", features="no-such-feature") - -class RegistryTest(unittest.TestCase): - """Test the TreeBuilderRegistry class in general.""" - - def setUp(self): - self.registry = TreeBuilderRegistry() - - def builder_for_features(self, *feature_list): - cls = type('Builder_' + '_'.join(feature_list), - (object,), {'features' : feature_list}) - - self.registry.register(cls) - return cls - - def test_register_with_no_features(self): - builder = self.builder_for_features() - - # Since the builder advertises no features, you can't find it - # by looking up features. - self.assertEqual(self.registry.lookup('foo'), None) - - # But you can find it by doing a lookup with no features, if - # this happens to be the only registered builder. - self.assertEqual(self.registry.lookup(), builder) - - def test_register_with_features_makes_lookup_succeed(self): - builder = self.builder_for_features('foo', 'bar') - self.assertEqual(self.registry.lookup('foo'), builder) - self.assertEqual(self.registry.lookup('bar'), builder) - - def test_lookup_fails_when_no_builder_implements_feature(self): - builder = self.builder_for_features('foo', 'bar') - self.assertEqual(self.registry.lookup('baz'), None) - - def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): - builder1 = self.builder_for_features('foo') - builder2 = self.builder_for_features('bar') - self.assertEqual(self.registry.lookup(), builder2) - - def test_lookup_fails_when_no_tree_builders_registered(self): - self.assertEqual(self.registry.lookup(), None) - - def test_lookup_gets_most_recent_builder_supporting_all_features(self): - has_one = self.builder_for_features('foo') - has_the_other = self.builder_for_features('bar') - has_both_early = self.builder_for_features('foo', 'bar', 'baz') - has_both_late = self.builder_for_features('foo', 'bar', 'quux') - lacks_one = self.builder_for_features('bar') - has_the_other = self.builder_for_features('foo') - - # There are two builders featuring 'foo' and 'bar', but - # the one that also features 'quux' was registered later. - self.assertEqual(self.registry.lookup('foo', 'bar'), - has_both_late) - - # There is only one builder featuring 'foo', 'bar', and 'baz'. - self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), - has_both_early) - - def test_lookup_fails_when_cannot_reconcile_requested_features(self): - builder1 = self.builder_for_features('foo', 'bar') - builder2 = self.builder_for_features('foo', 'baz') - self.assertEqual(self.registry.lookup('bar', 'baz'), None) diff -Nru beautifulsoup4-4.10.0/bs4/tests/test_docs.py beautifulsoup4-1.7.1-benchmark/bs4/tests/test_docs.py --- beautifulsoup4-4.10.0/bs4/tests/test_docs.py 2020-04-05 19:54:12.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bs4/tests/test_docs.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,36 +0,0 @@ -"Test harness for doctests." - -# pylint: disable-msg=E0611,W0142 - -__metaclass__ = type -__all__ = [ - 'additional_tests', - ] - -import atexit -import doctest -import os -#from pkg_resources import ( -# resource_filename, resource_exists, resource_listdir, cleanup_resources) -import unittest - -DOCTEST_FLAGS = ( - doctest.ELLIPSIS | - doctest.NORMALIZE_WHITESPACE | - doctest.REPORT_NDIFF) - - -# def additional_tests(): -# "Run the doc tests (README.txt and docs/*, if any exist)" -# doctest_files = [ -# os.path.abspath(resource_filename('bs4', 'README.txt'))] -# if resource_exists('bs4', 'docs'): -# for name in resource_listdir('bs4', 'docs'): -# if name.endswith('.txt'): -# doctest_files.append( -# os.path.abspath( -# resource_filename('bs4', 'docs/%s' % name))) -# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) -# atexit.register(cleanup_resources) -# return unittest.TestSuite(( -# doctest.DocFileSuite(*doctest_files, **kwargs))) diff -Nru beautifulsoup4-4.10.0/bs4/tests/test_html5lib.py beautifulsoup4-1.7.1-benchmark/bs4/tests/test_html5lib.py --- beautifulsoup4-4.10.0/bs4/tests/test_html5lib.py 2021-09-07 23:36:47.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bs4/tests/test_html5lib.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,226 +0,0 @@ -"""Tests to ensure that the html5lib tree builder generates good trees.""" - -import warnings - -try: - from bs4.builder import HTML5TreeBuilder - HTML5LIB_PRESENT = True -except ImportError as e: - HTML5LIB_PRESENT = False -from bs4.element import SoupStrainer -from bs4.testing import ( - HTML5TreeBuilderSmokeTest, - SoupTest, - skipIf, -) - -@skipIf( - not HTML5LIB_PRESENT, - "html5lib seems not to be present, not testing its tree builder.") -class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): - """See ``HTML5TreeBuilderSmokeTest``.""" - - @property - def default_builder(self): - return HTML5TreeBuilder - - def test_soupstrainer(self): - # The html5lib tree builder does not support SoupStrainers. - strainer = SoupStrainer("b") - markup = "

A bold statement.

" - with warnings.catch_warnings(record=True) as w: - soup = self.soup(markup, parse_only=strainer) - self.assertEqual( - soup.decode(), self.document_for(markup)) - - self.assertTrue( - "the html5lib tree builder doesn't support parse_only" in - str(w[0].message)) - - def test_correctly_nested_tables(self): - """html5lib inserts tags where other parsers don't.""" - markup = ('' - '' - "') - - self.assertSoupEquals( - markup, - '
Here's another table:" - '' - '' - '
foo
Here\'s another table:' - '
foo
' - '
') - - self.assertSoupEquals( - "" - "" - "
Foo
Bar
Baz
") - - def test_xml_declaration_followed_by_doctype(self): - markup = ''' - - - - - -

foo

- -''' - soup = self.soup(markup) - # Verify that we can reach the

tag; this means the tree is connected. - self.assertEqual(b"

foo

", soup.p.encode()) - - def test_reparented_markup(self): - markup = '

foo

\n

bar

' - soup = self.soup(markup) - self.assertEqual("

foo

\n

bar

", soup.body.decode()) - self.assertEqual(2, len(soup.find_all('p'))) - - - def test_reparented_markup_ends_with_whitespace(self): - markup = '

foo

\n

bar

\n' - soup = self.soup(markup) - self.assertEqual("

foo

\n

bar

\n", soup.body.decode()) - self.assertEqual(2, len(soup.find_all('p'))) - - def test_reparented_markup_containing_identical_whitespace_nodes(self): - """Verify that we keep the two whitespace nodes in this - document distinct when reparenting the adjacent tags. - """ - markup = '
' - soup = self.soup(markup) - space1, space2 = soup.find_all(string=' ') - tbody1, tbody2 = soup.find_all('tbody') - assert space1.next_element is tbody1 - assert tbody2.next_element is space2 - - def test_reparented_markup_containing_children(self): - markup = '' - soup = self.soup(markup) - noscript = soup.noscript - self.assertEqual("target", noscript.next_element) - target = soup.find(string='target') - - # The 'aftermath' string was duplicated; we want the second one. - final_aftermath = soup.find_all(string='aftermath')[-1] - - # The