diff -Nru beautifulsoup4-4.10.0/appveyor.yml beautifulsoup4-1.7.1-benchmark/appveyor.yml --- beautifulsoup4-4.10.0/appveyor.yml 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/appveyor.yml 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,50 @@ +version: '{build}' + +image: Visual Studio 2017 + +configuration: + - Debug + - Release + +environment: + matrix: + - compiler: msvc-15-seh + generator: "Visual Studio 15 2017" + + - compiler: msvc-15-seh + generator: "Visual Studio 15 2017 Win64" + + - compiler: msvc-14-seh + generator: "Visual Studio 14 2015" + + - compiler: msvc-14-seh + generator: "Visual Studio 14 2015 Win64" + + - compiler: gcc-5.3.0-posix + generator: "MinGW Makefiles" + cxx_path: 'C:\mingw-w64\i686-5.3.0-posix-dwarf-rt_v4-rev0\mingw32\bin' + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + +matrix: + fast_finish: true + +install: + # git bash conflicts with MinGW makefiles + - if "%generator%"=="MinGW Makefiles" (set "PATH=%PATH:C:\Program Files\Git\usr\bin;=%") + - if not "%cxx_path%"=="" (set "PATH=%PATH%;%cxx_path%") + +build_script: + - md _build -Force + - cd _build + - echo %configuration% + - cmake -G "%generator%" "-DCMAKE_BUILD_TYPE=%configuration%" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON .. + - cmake --build . --config %configuration% + +test_script: + - ctest --build-config %configuration% --timeout 300 --output-on-failure + +artifacts: + - path: '_build/CMakeFiles/*.log' + name: logs + - path: '_build/Testing/**/*.xml' + name: test_results diff -Nru beautifulsoup4-4.10.0/AUTHORS beautifulsoup4-1.7.1-benchmark/AUTHORS --- beautifulsoup4-4.10.0/AUTHORS 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/AUTHORS 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,67 @@ +# This is the official list of benchmark authors for copyright purposes. +# This file is distinct from the CONTRIBUTORS files. +# See the latter for an explanation. 
+# +# Names should be added to this file as: +# Name or Organization +# The email address is not required for organizations. +# +# Please keep the list sorted. + +Albert Pretorius +Alex Steele +Andriy Berestovskyy +Arne Beer +Carto +Cezary Skrzyński +Christian Wassermann +Christopher Seymour +Colin Braley +Daniel Harvey +David Coeurjolly +Deniz Evrenci +Dirac Research +Dominik Czarnota +Dominik Korman +Donald Aingworth +Eric Backus +Eric Fiselier +Eugene Zhuk +Evgeny Safronov +Federico Ficarelli +Felix Homann +Gergő Szitár +Google Inc. +International Business Machines Corporation +Ismael Jimenez Martinez +Jern-Kuan Leong +JianXiong Zhou +Joao Paulo Magalhaes +Jordan Williams +Jussi Knuuttila +Kaito Udagawa +Kishan Kumar +Lei Xu +Matt Clarkson +Maxim Vafin +MongoDB Inc. +Nick Hutchinson +Norman Heino +Oleksandr Sochka +Ori Livneh +Paul Redmond +Raghu Raja +Radoslav Yovchev +Rainer Orth +Roman Lebedev +Sayan Bhattacharjee +Shapr3D +Shuo Chen +Staffan Tjernstrom +Steinar H. Gunderson +Stripe, Inc. 
+Tobias Schmidt +Yixuan Qiu +Yusuke Suzuki +Zbigniew Skowron +Min-Yih Hsu diff -Nru beautifulsoup4-4.10.0/beautifulsoup4.egg-info/dependency_links.txt beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/dependency_links.txt --- beautifulsoup4-4.10.0/beautifulsoup4.egg-info/dependency_links.txt 2021-09-08 00:13:24.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/dependency_links.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ - diff -Nru beautifulsoup4-4.10.0/beautifulsoup4.egg-info/PKG-INFO beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/PKG-INFO --- beautifulsoup4-4.10.0/beautifulsoup4.egg-info/PKG-INFO 2021-09-08 00:13:24.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/PKG-INFO 1970-01-01 00:00:00.000000000 +0000 @@ -1,115 +0,0 @@ -Metadata-Version: 2.1 -Name: beautifulsoup4 -Version: 4.10.0 -Summary: Screen-scraping library -Home-page: http://www.crummy.com/software/BeautifulSoup/bs4/ -Author: Leonard Richardson -Author-email: leonardr@segfault.org -License: MIT -Download-URL: http://www.crummy.com/software/BeautifulSoup/bs4/download/ -Description: Beautiful Soup is a library that makes it easy to scrape information - from web pages. It sits atop an HTML or XML parser, providing Pythonic - idioms for iterating, searching, and modifying the parse tree. - - # Quick start - - ``` - >>> from bs4 import BeautifulSoup - >>> soup = BeautifulSoup("

SomebadHTML") - >>> print(soup.prettify()) - - -

- Some - - bad - - HTML - - -

- - - >>> soup.find(text="bad") - 'bad' - >>> soup.i - HTML - # - >>> soup = BeautifulSoup("SomebadXML", "xml") - # - >>> print(soup.prettify()) - - - Some - - bad - - XML - - - ``` - - To go beyond the basics, [comprehensive documentation is available](http://www.crummy.com/software/BeautifulSoup/bs4/doc/). - - # Links - - * [Homepage](http://www.crummy.com/software/BeautifulSoup/bs4/) - * [Documentation](http://www.crummy.com/software/BeautifulSoup/bs4/doc/) - * [Discussion group](http://groups.google.com/group/beautifulsoup/) - * [Development](https://code.launchpad.net/beautifulsoup/) - * [Bug tracker](https://bugs.launchpad.net/beautifulsoup/) - * [Complete changelog](https://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/view/head:/CHANGELOG) - - # Note on Python 2 sunsetting - - Beautiful Soup's support for Python 2 was discontinued on December 31, - 2020: one year after the sunset date for Python 2 itself. From this - point onward, new Beautiful Soup development will exclusively target - Python 3. The final release of Beautiful Soup 4 to support Python 2 - was 4.9.3. - - # Supporting the project - - If you use Beautiful Soup as part of your professional work, please consider a - [Tidelift subscription](https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=readme). - This will support many of the free software projects your organization - depends on, not just Beautiful Soup. - - If you use Beautiful Soup for personal projects, the best way to say - thank you is to read - [Tool Safety](https://www.crummy.com/software/BeautifulSoup/zine/), a zine I - wrote about what Beautiful Soup has taught me about software - development. - - # Building the documentation - - The bs4/doc/ directory contains full documentation in Sphinx - format. Run `make html` in that directory to create HTML - documentation. 
- - # Running the unit tests - - Beautiful Soup supports unit test discovery from the project root directory: - - ``` - $ nosetests - ``` - - ``` - $ python3 -m unittest discover -s bs4 - ``` - -Platform: UNKNOWN -Classifier: Development Status :: 5 - Production/Stable -Classifier: Intended Audience :: Developers -Classifier: License :: OSI Approved :: MIT License -Classifier: Programming Language :: Python -Classifier: Programming Language :: Python :: 3 -Classifier: Topic :: Text Processing :: Markup :: HTML -Classifier: Topic :: Text Processing :: Markup :: XML -Classifier: Topic :: Text Processing :: Markup :: SGML -Classifier: Topic :: Software Development :: Libraries :: Python Modules -Requires-Python: >3.0.0 -Description-Content-Type: text/markdown -Provides-Extra: html5lib -Provides-Extra: lxml diff -Nru beautifulsoup4-4.10.0/beautifulsoup4.egg-info/requires.txt beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/requires.txt --- beautifulsoup4-4.10.0/beautifulsoup4.egg-info/requires.txt 2021-09-08 00:13:24.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/requires.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -soupsieve>1.2 - -[html5lib] -html5lib - -[lxml] -lxml diff -Nru beautifulsoup4-4.10.0/beautifulsoup4.egg-info/SOURCES.txt beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/SOURCES.txt --- beautifulsoup4-4.10.0/beautifulsoup4.egg-info/SOURCES.txt 2021-09-08 00:13:24.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/SOURCES.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,53 +0,0 @@ -COPYING.txt -LICENSE -MANIFEST.in -NEWS.txt -README.md -TODO.txt -parse.txt -setup.cfg -setup.py -test-all-versions -beautifulsoup4.egg-info/PKG-INFO -beautifulsoup4.egg-info/SOURCES.txt -beautifulsoup4.egg-info/dependency_links.txt -beautifulsoup4.egg-info/requires.txt -beautifulsoup4.egg-info/top_level.txt -bs4/__init__.py -bs4/dammit.py -bs4/diagnose.py -bs4/element.py -bs4/formatter.py -bs4/testing.py 
-bs4/builder/__init__.py -bs4/builder/_html5lib.py -bs4/builder/_htmlparser.py -bs4/builder/_lxml.py -bs4/tests/__init__.py -bs4/tests/test_builder_registry.py -bs4/tests/test_docs.py -bs4/tests/test_html5lib.py -bs4/tests/test_htmlparser.py -bs4/tests/test_lxml.py -bs4/tests/test_soup.py -bs4/tests/test_tree.py -doc/Makefile -doc.ptbr/Makefile -doc.ptbr/source/6.1.jpg -doc.ptbr/source/conf.py -doc.ptbr/source/index.rst -doc.ru/Makefile -doc.ru/source/6.1.jpg -doc.ru/source/bs4ru.rst -doc.ru/source/conf.py -doc.ru/source/index.rst -doc.zh/Makefile -doc.zh/source/6.1.jpg -doc.zh/source/conf.py -doc.zh/source/index.rst -doc/source/6.1.jpg -doc/source/check_doc.py -doc/source/conf.py -doc/source/index.rst -scripts/demonstrate_parser_differences.py -scripts/demonstration_markup.txt \ No newline at end of file diff -Nru beautifulsoup4-4.10.0/beautifulsoup4.egg-info/top_level.txt beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/top_level.txt --- beautifulsoup4-4.10.0/beautifulsoup4.egg-info/top_level.txt 2021-09-08 00:13:24.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/beautifulsoup4.egg-info/top_level.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -bs4 diff -Nru beautifulsoup4-4.10.0/bindings/python/BUILD beautifulsoup4-1.7.1-benchmark/bindings/python/BUILD --- beautifulsoup4-4.10.0/bindings/python/BUILD 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/BUILD 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,3 @@ +exports_files(glob(["*.BUILD"])) +exports_files(["build_defs.bzl"]) + diff -Nru beautifulsoup4-4.10.0/bindings/python/build_defs.bzl beautifulsoup4-1.7.1-benchmark/bindings/python/build_defs.bzl --- beautifulsoup4-4.10.0/bindings/python/build_defs.bzl 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/build_defs.bzl 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,25 @@ +_SHARED_LIB_SUFFIX = { + "//conditions:default": ".so", + "//:windows": ".dll", +} + +def 
py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []): + for shared_lib_suffix in _SHARED_LIB_SUFFIX.values(): + shared_lib_name = name + shared_lib_suffix + native.cc_binary( + name = shared_lib_name, + linkshared = True, + linkstatic = True, + srcs = srcs + hdrs, + copts = copts, + features = features, + deps = deps, + ) + + return native.py_library( + name = name, + data = select({ + platform: [name + shared_lib_suffix] + for platform, shared_lib_suffix in _SHARED_LIB_SUFFIX.items() + }), + ) diff -Nru beautifulsoup4-4.10.0/bindings/python/google_benchmark/benchmark.cc beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/benchmark.cc --- beautifulsoup4-4.10.0/bindings/python/google_benchmark/benchmark.cc 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/benchmark.cc 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,184 @@ +// Benchmark for Python. + +#include "benchmark/benchmark.h" + +#include +#include +#include + +#include "pybind11/operators.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "pybind11/stl_bind.h" + +PYBIND11_MAKE_OPAQUE(benchmark::UserCounters); + +namespace { +namespace py = ::pybind11; + +std::vector Initialize(const std::vector& argv) { + // The `argv` pointers here become invalid when this function returns, but + // benchmark holds the pointer to `argv[0]`. We create a static copy of it + // so it persists, and replace the pointer below. 
+ static std::string executable_name(argv[0]); + std::vector ptrs; + ptrs.reserve(argv.size()); + for (auto& arg : argv) { + ptrs.push_back(const_cast(arg.c_str())); + } + ptrs[0] = const_cast(executable_name.c_str()); + int argc = static_cast(argv.size()); + benchmark::Initialize(&argc, ptrs.data()); + std::vector remaining_argv; + remaining_argv.reserve(argc); + for (int i = 0; i < argc; ++i) { + remaining_argv.emplace_back(ptrs[i]); + } + return remaining_argv; +} + +benchmark::internal::Benchmark* RegisterBenchmark(const char* name, + py::function f) { + return benchmark::RegisterBenchmark( + name, [f](benchmark::State& state) { f(&state); }); +} + +PYBIND11_MODULE(_benchmark, m) { + using benchmark::TimeUnit; + py::enum_(m, "TimeUnit") + .value("kNanosecond", TimeUnit::kNanosecond) + .value("kMicrosecond", TimeUnit::kMicrosecond) + .value("kMillisecond", TimeUnit::kMillisecond) + .value("kSecond", TimeUnit::kSecond) + .export_values(); + + using benchmark::BigO; + py::enum_(m, "BigO") + .value("oNone", BigO::oNone) + .value("o1", BigO::o1) + .value("oN", BigO::oN) + .value("oNSquared", BigO::oNSquared) + .value("oNCubed", BigO::oNCubed) + .value("oLogN", BigO::oLogN) + .value("oNLogN", BigO::oNLogN) + .value("oAuto", BigO::oAuto) + .value("oLambda", BigO::oLambda) + .export_values(); + + using benchmark::internal::Benchmark; + py::class_(m, "Benchmark") + // For methods returning a pointer to the current object, reference + // return policy is used to ask pybind not to take ownership of the + // returned object and avoid calling delete on it. + // https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies + // + // For methods taking a const std::vector<...>&, a copy is created + // because it is bound to a Python list. 
+ // https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html + .def("unit", &Benchmark::Unit, py::return_value_policy::reference) + .def("arg", &Benchmark::Arg, py::return_value_policy::reference) + .def("args", &Benchmark::Args, py::return_value_policy::reference) + .def("range", &Benchmark::Range, py::return_value_policy::reference, + py::arg("start"), py::arg("limit")) + .def("dense_range", &Benchmark::DenseRange, + py::return_value_policy::reference, py::arg("start"), + py::arg("limit"), py::arg("step") = 1) + .def("ranges", &Benchmark::Ranges, py::return_value_policy::reference) + .def("args_product", &Benchmark::ArgsProduct, + py::return_value_policy::reference) + .def("arg_name", &Benchmark::ArgName, py::return_value_policy::reference) + .def("arg_names", &Benchmark::ArgNames, + py::return_value_policy::reference) + .def("range_pair", &Benchmark::RangePair, + py::return_value_policy::reference, py::arg("lo1"), py::arg("hi1"), + py::arg("lo2"), py::arg("hi2")) + .def("range_multiplier", &Benchmark::RangeMultiplier, + py::return_value_policy::reference) + .def("min_time", &Benchmark::MinTime, py::return_value_policy::reference) + .def("min_warmup_time", &Benchmark::MinWarmUpTime, + py::return_value_policy::reference) + .def("iterations", &Benchmark::Iterations, + py::return_value_policy::reference) + .def("repetitions", &Benchmark::Repetitions, + py::return_value_policy::reference) + .def("report_aggregates_only", &Benchmark::ReportAggregatesOnly, + py::return_value_policy::reference, py::arg("value") = true) + .def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly, + py::return_value_policy::reference, py::arg("value") = true) + .def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime, + py::return_value_policy::reference) + .def("use_real_time", &Benchmark::UseRealTime, + py::return_value_policy::reference) + .def("use_manual_time", &Benchmark::UseManualTime, + py::return_value_policy::reference) + .def( + "complexity", + 
(Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity, + py::return_value_policy::reference, + py::arg("complexity") = benchmark::oAuto); + + using benchmark::Counter; + py::class_ py_counter(m, "Counter"); + + py::enum_(py_counter, "Flags") + .value("kDefaults", Counter::Flags::kDefaults) + .value("kIsRate", Counter::Flags::kIsRate) + .value("kAvgThreads", Counter::Flags::kAvgThreads) + .value("kAvgThreadsRate", Counter::Flags::kAvgThreadsRate) + .value("kIsIterationInvariant", Counter::Flags::kIsIterationInvariant) + .value("kIsIterationInvariantRate", + Counter::Flags::kIsIterationInvariantRate) + .value("kAvgIterations", Counter::Flags::kAvgIterations) + .value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate) + .value("kInvert", Counter::Flags::kInvert) + .export_values() + .def(py::self | py::self); + + py::enum_(py_counter, "OneK") + .value("kIs1000", Counter::OneK::kIs1000) + .value("kIs1024", Counter::OneK::kIs1024) + .export_values(); + + py_counter + .def(py::init(), + py::arg("value") = 0., py::arg("flags") = Counter::kDefaults, + py::arg("k") = Counter::kIs1000) + .def(py::init([](double value) { return Counter(value); })) + .def_readwrite("value", &Counter::value) + .def_readwrite("flags", &Counter::flags) + .def_readwrite("oneK", &Counter::oneK); + py::implicitly_convertible(); + py::implicitly_convertible(); + + py::bind_map(m, "UserCounters"); + + using benchmark::State; + py::class_(m, "State") + .def("__bool__", &State::KeepRunning) + .def_property_readonly("keep_running", &State::KeepRunning) + .def("pause_timing", &State::PauseTiming) + .def("resume_timing", &State::ResumeTiming) + .def("skip_with_error", &State::SkipWithError) + .def_property_readonly("error_occurred", &State::error_occurred) + .def("set_iteration_time", &State::SetIterationTime) + .def_property("bytes_processed", &State::bytes_processed, + &State::SetBytesProcessed) + .def_property("complexity_n", &State::complexity_length_n, + &State::SetComplexityN) 
+ .def_property("items_processed", &State::items_processed, + &State::SetItemsProcessed) + .def("set_label", (void (State::*)(const char*)) & State::SetLabel) + .def("range", &State::range, py::arg("pos") = 0) + .def_property_readonly("iterations", &State::iterations) + .def_readwrite("counters", &State::counters) + .def_property_readonly("thread_index", &State::thread_index) + .def_property_readonly("threads", &State::threads); + + m.def("Initialize", Initialize); + m.def("RegisterBenchmark", RegisterBenchmark, + py::return_value_policy::reference); + m.def("RunSpecifiedBenchmarks", + []() { benchmark::RunSpecifiedBenchmarks(); }); + m.def("ClearRegisteredBenchmarks", benchmark::ClearRegisteredBenchmarks); +}; +} // namespace diff -Nru beautifulsoup4-4.10.0/bindings/python/google_benchmark/BUILD beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/BUILD --- beautifulsoup4-4.10.0/bindings/python/google_benchmark/BUILD 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/BUILD 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,38 @@ +load("//bindings/python:build_defs.bzl", "py_extension") + +py_library( + name = "google_benchmark", + srcs = ["__init__.py"], + visibility = ["//visibility:public"], + deps = [ + ":_benchmark", + # pip; absl:app + ], +) + +py_extension( + name = "_benchmark", + srcs = ["benchmark.cc"], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + ], + features = ["-use_header_modules"], + deps = [ + "//:benchmark", + "@pybind11", + "@python_headers", + ], +) + +py_test( + name = "example", + srcs = ["example.py"], + python_version = "PY3", + srcs_version = "PY3", + visibility = ["//visibility:public"], + deps = [ + ":google_benchmark", + ], +) + diff -Nru beautifulsoup4-4.10.0/bindings/python/google_benchmark/example.py beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/example.py --- beautifulsoup4-4.10.0/bindings/python/google_benchmark/example.py 1970-01-01 
00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/example.py 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,136 @@ +# Copyright 2020 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Example of Python using C++ benchmark framework. + +To run this example, you must first install the `google_benchmark` Python package. + +To install using `setup.py`, download and extract the `google_benchmark` source. +In the extracted directory, execute: + python setup.py install +""" + +import random +import time + +import google_benchmark as benchmark +from google_benchmark import Counter + + +@benchmark.register +def empty(state): + while state: + pass + + +@benchmark.register +def sum_million(state): + while state: + sum(range(1_000_000)) + +@benchmark.register +def pause_timing(state): + """Pause timing every iteration.""" + while state: + # Construct a list of random ints every iteration without timing it + state.pause_timing() + random_list = [random.randint(0, 100) for _ in range(100)] + state.resume_timing() + # Time the in place sorting algorithm + random_list.sort() + + +@benchmark.register +def skipped(state): + if True: # Test some predicate here. + state.skip_with_error("some error") + return # NOTE: You must explicitly return, or benchmark will continue. + + ... # Benchmark code would be here. 
+ + +@benchmark.register +def manual_timing(state): + while state: + # Manually count Python CPU time + start = time.perf_counter() # perf_counter_ns() in Python 3.7+ + # Something to benchmark + time.sleep(0.01) + end = time.perf_counter() + state.set_iteration_time(end - start) + + +@benchmark.register +def custom_counters(state): + """Collect custom metric using benchmark.Counter.""" + num_foo = 0.0 + while state: + # Benchmark some code here + pass + # Collect some custom metric named foo + num_foo += 0.13 + + # Automatic Counter from numbers. + state.counters["foo"] = num_foo + # Set a counter as a rate. + state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate) + # Set a counter as an inverse of rate. + state.counters["foo_inv_rate"] = Counter(num_foo, Counter.kIsRate | Counter.kInvert) + # Set a counter as a thread-average quantity. + state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads) + # There's also a combined flag: + state.counters["foo_avg_rate"] = Counter(num_foo, Counter.kAvgThreadsRate) + + +@benchmark.register +@benchmark.option.measure_process_cpu_time() +@benchmark.option.use_real_time() +def with_options(state): + while state: + sum(range(1_000_000)) + + +@benchmark.register(name="sum_million_microseconds") +@benchmark.option.unit(benchmark.kMicrosecond) +def with_options2(state): + while state: + sum(range(1_000_000)) + + +@benchmark.register +@benchmark.option.arg(100) +@benchmark.option.arg(1000) +def passing_argument(state): + while state: + sum(range(state.range(0))) + + +@benchmark.register +@benchmark.option.range(8, limit=8 << 10) +def using_range(state): + while state: + sum(range(state.range(0))) + + +@benchmark.register +@benchmark.option.range_multiplier(2) +@benchmark.option.range(1 << 10, 1 << 18) +@benchmark.option.complexity(benchmark.oN) +def computing_complexity(state): + while state: + sum(range(state.range(0))) + state.complexity_n = state.range(0) + + +if __name__ == "__main__": + benchmark.main() diff -Nru 
beautifulsoup4-4.10.0/bindings/python/google_benchmark/__init__.py beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/__init__.py --- beautifulsoup4-4.10.0/bindings/python/google_benchmark/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/google_benchmark/__init__.py 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,162 @@ +# Copyright 2020 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Python benchmarking utilities. + +Example usage: + import google_benchmark as benchmark + + @benchmark.register + def my_benchmark(state): + ... # Code executed outside `while` loop is not timed. + + while state: + ... # Code executed within `while` loop is timed. + + if __name__ == '__main__': + benchmark.main() +""" +import atexit + +from absl import app +from google_benchmark import _benchmark +from google_benchmark._benchmark import ( + Counter, + kNanosecond, + kMicrosecond, + kMillisecond, + kSecond, + oNone, + o1, + oN, + oNSquared, + oNCubed, + oLogN, + oNLogN, + oAuto, + oLambda, + State, +) + + +__all__ = [ + "register", + "main", + "Counter", + "kNanosecond", + "kMicrosecond", + "kMillisecond", + "kSecond", + "oNone", + "o1", + "oN", + "oNSquared", + "oNCubed", + "oLogN", + "oNLogN", + "oAuto", + "oLambda", + "State", +] + +__version__ = "1.7.1" + + +class __OptionMaker: + """A stateless class to collect benchmark options. 
+ + Collect all decorator calls like @option.range(start=0, limit=1<<5). + """ + + class Options: + """Pure data class to store options calls, along with the benchmarked function.""" + + def __init__(self, func): + self.func = func + self.builder_calls = [] + + @classmethod + def make(cls, func_or_options): + """Make Options from Options or the benchmarked function.""" + if isinstance(func_or_options, cls.Options): + return func_or_options + return cls.Options(func_or_options) + + def __getattr__(self, builder_name): + """Append option call in the Options.""" + + # The function that gets returned on @option.range(start=0, limit=1<<5). + def __builder_method(*args, **kwargs): + + # The decorator that gets called, either with the benchmarked function + # or the previous Options + def __decorator(func_or_options): + options = self.make(func_or_options) + options.builder_calls.append((builder_name, args, kwargs)) + # The decorator returns Options so it is not technically a decorator + # and needs a final call to @register + return options + + return __decorator + + return __builder_method + + +# Alias for nicer API. +# We have to instantiate an object, even if stateless, to be able to use __getattr__ +# on option.range +option = __OptionMaker() + + +def register(undefined=None, *, name=None): + """Register function for benchmarking.""" + if undefined is None: + # Decorator is called with parentheses, e.g. @register(name=...), so we return a decorator + return lambda f: register(f, name=name) + + # We have either the function to benchmark (simple case) or an instance of Options + # (@option._ case). 
+ options = __OptionMaker.make(undefined) + + if name is None: + name = options.func.__name__ + + # We register the benchmark and reproduce all the @option._ calls onto the + # benchmark builder pattern + benchmark = _benchmark.RegisterBenchmark(name, options.func) + for name, args, kwargs in options.builder_calls[::-1]: + getattr(benchmark, name)(*args, **kwargs) + + # return the benchmarked function because the decorator does not modify it + return options.func + + +def _flags_parser(argv): + argv = _benchmark.Initialize(argv) + return app.parse_flags_with_usage(argv) + + +def _run_benchmarks(argv): + if len(argv) > 1: + raise app.UsageError("Too many command-line arguments.") + return _benchmark.RunSpecifiedBenchmarks() + + +def main(argv=None): + return app.run(_run_benchmarks, argv=argv, flags_parser=_flags_parser) + + +# Methods for use with custom main function. +initialize = _benchmark.Initialize +run_benchmarks = _benchmark.RunSpecifiedBenchmarks +atexit.register(_benchmark.ClearRegisteredBenchmarks) diff -Nru beautifulsoup4-4.10.0/bindings/python/pybind11.BUILD beautifulsoup4-1.7.1-benchmark/bindings/python/pybind11.BUILD --- beautifulsoup4-4.10.0/bindings/python/pybind11.BUILD 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/pybind11.BUILD 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,20 @@ +cc_library( + name = "pybind11", + hdrs = glob( + include = [ + "include/pybind11/*.h", + "include/pybind11/detail/*.h", + ], + exclude = [ + "include/pybind11/common.h", + "include/pybind11/eigen.h", + ], + ), + copts = [ + "-fexceptions", + "-Wno-undefined-inline", + "-Wno-pragma-once-outside-header", + ], + includes = ["include"], + visibility = ["//visibility:public"], +) diff -Nru beautifulsoup4-4.10.0/bindings/python/python_headers.BUILD beautifulsoup4-1.7.1-benchmark/bindings/python/python_headers.BUILD --- beautifulsoup4-4.10.0/bindings/python/python_headers.BUILD 1970-01-01 00:00:00.000000000 +0000 +++ 
beautifulsoup4-1.7.1-benchmark/bindings/python/python_headers.BUILD 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,6 @@ +cc_library( + name = "python_headers", + hdrs = glob(["**/*.h"]), + includes = ["."], + visibility = ["//visibility:public"], +) diff -Nru beautifulsoup4-4.10.0/bindings/python/requirements.txt beautifulsoup4-1.7.1-benchmark/bindings/python/requirements.txt --- beautifulsoup4-4.10.0/bindings/python/requirements.txt 1970-01-01 00:00:00.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bindings/python/requirements.txt 2022-11-11 14:01:03.000000000 +0000 @@ -0,0 +1,2 @@ +absl-py>=0.7.1 + diff -Nru beautifulsoup4-4.10.0/bs4/builder/_html5lib.py beautifulsoup4-1.7.1-benchmark/bs4/builder/_html5lib.py --- beautifulsoup4-4.10.0/bs4/builder/_html5lib.py 2021-09-07 23:36:46.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bs4/builder/_html5lib.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,467 +0,0 @@ -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -__all__ = [ - 'HTML5TreeBuilder', - ] - -import warnings -import re -from bs4.builder import ( - PERMISSIVE, - HTML, - HTML_5, - HTMLTreeBuilder, - ) -from bs4.element import ( - NamespacedAttribute, - nonwhitespace_re, -) -import html5lib -from html5lib.constants import ( - namespaces, - prefixes, - ) -from bs4.element import ( - Comment, - Doctype, - NavigableString, - Tag, - ) - -try: - # Pre-0.99999999 - from html5lib.treebuilders import _base as treebuilder_base - new_html5lib = False -except ImportError as e: - # 0.99999999 and up - from html5lib.treebuilders import base as treebuilder_base - new_html5lib = True - -class HTML5TreeBuilder(HTMLTreeBuilder): - """Use html5lib to build a tree. - - Note that this TreeBuilder does not support some features common - to HTML TreeBuilders. Some of these features could theoretically - be implemented, but at the very least it's quite difficult, - because html5lib moves the parse tree around as it's being built. 
- - * This TreeBuilder doesn't use different subclasses of NavigableString - based on the name of the tag in which the string was found. - - * You can't use a SoupStrainer to parse only part of a document. - """ - - NAME = "html5lib" - - features = [NAME, PERMISSIVE, HTML_5, HTML] - - # html5lib can tell us which line number and position in the - # original file is the source of an element. - TRACKS_LINE_NUMBERS = True - - def prepare_markup(self, markup, user_specified_encoding, - document_declared_encoding=None, exclude_encodings=None): - # Store the user-specified encoding for use later on. - self.user_specified_encoding = user_specified_encoding - - # document_declared_encoding and exclude_encodings aren't used - # ATM because the html5lib TreeBuilder doesn't use - # UnicodeDammit. - if exclude_encodings: - warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") - yield (markup, None, None, False) - - # These methods are defined by Beautiful Soup. - def feed(self, markup): - if self.soup.parse_only is not None: - warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") - parser = html5lib.HTMLParser(tree=self.create_treebuilder) - self.underlying_builder.parser = parser - extra_kwargs = dict() - if not isinstance(markup, str): - if new_html5lib: - extra_kwargs['override_encoding'] = self.user_specified_encoding - else: - extra_kwargs['encoding'] = self.user_specified_encoding - doc = parser.parse(markup, **extra_kwargs) - - # Set the character encoding detected by the tokenizer. - if isinstance(markup, str): - # We need to special-case this because html5lib sets - # charEncoding to UTF-8 if it gets Unicode input. 
- doc.original_encoding = None - else: - original_encoding = parser.tokenizer.stream.charEncoding[0] - if not isinstance(original_encoding, str): - # In 0.99999999 and up, the encoding is an html5lib - # Encoding object. We want to use a string for compatibility - # with other tree builders. - original_encoding = original_encoding.name - doc.original_encoding = original_encoding - self.underlying_builder.parser = None - - def create_treebuilder(self, namespaceHTMLElements): - self.underlying_builder = TreeBuilderForHtml5lib( - namespaceHTMLElements, self.soup, - store_line_numbers=self.store_line_numbers - ) - return self.underlying_builder - - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return '%s' % fragment - - -class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): - - def __init__(self, namespaceHTMLElements, soup=None, - store_line_numbers=True, **kwargs): - if soup: - self.soup = soup - else: - from bs4 import BeautifulSoup - # TODO: Why is the parser 'html.parser' here? To avoid an - # infinite loop? - self.soup = BeautifulSoup( - "", "html.parser", store_line_numbers=store_line_numbers, - **kwargs - ) - # TODO: What are **kwargs exactly? Should they be passed in - # here in addition to/instead of being passed to the BeautifulSoup - # constructor? - super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) - - # This will be set later to an html5lib.html5parser.HTMLParser - # object, which we can use to track the current line number. 
- self.parser = None - self.store_line_numbers = store_line_numbers - - def documentClass(self): - self.soup.reset() - return Element(self.soup, self.soup, None) - - def insertDoctype(self, token): - name = token["name"] - publicId = token["publicId"] - systemId = token["systemId"] - - doctype = Doctype.for_name_and_ids(name, publicId, systemId) - self.soup.object_was_parsed(doctype) - - def elementClass(self, name, namespace): - kwargs = {} - if self.parser and self.store_line_numbers: - # This represents the point immediately after the end of the - # tag. We don't know when the tag started, but we do know - # where it ended -- the character just before this one. - sourceline, sourcepos = self.parser.tokenizer.stream.position() - kwargs['sourceline'] = sourceline - kwargs['sourcepos'] = sourcepos-1 - tag = self.soup.new_tag(name, namespace, **kwargs) - - return Element(tag, self.soup, namespace) - - def commentClass(self, data): - return TextNode(Comment(data), self.soup) - - def fragmentClass(self): - from bs4 import BeautifulSoup - # TODO: Why is the parser 'html.parser' here? To avoid an - # infinite loop? - self.soup = BeautifulSoup("", "html.parser") - self.soup.name = "[document_fragment]" - return Element(self.soup, self.soup, None) - - def appendChild(self, node): - # XXX This code is not covered by the BS4 tests. 
- self.soup.append(node.element) - - def getDocument(self): - return self.soup - - def getFragment(self): - return treebuilder_base.TreeBuilder.getFragment(self).element - - def testSerializer(self, element): - from bs4 import BeautifulSoup - rv = [] - doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') - - def serializeElement(element, indent=0): - if isinstance(element, BeautifulSoup): - pass - if isinstance(element, Doctype): - m = doctype_re.match(element) - if m: - name = m.group(1) - if m.lastindex > 1: - publicId = m.group(2) or "" - systemId = m.group(3) or m.group(4) or "" - rv.append("""|%s""" % - (' ' * indent, name, publicId, systemId)) - else: - rv.append("|%s" % (' ' * indent, name)) - else: - rv.append("|%s" % (' ' * indent,)) - elif isinstance(element, Comment): - rv.append("|%s" % (' ' * indent, element)) - elif isinstance(element, NavigableString): - rv.append("|%s\"%s\"" % (' ' * indent, element)) - else: - if element.namespace: - name = "%s %s" % (prefixes[element.namespace], - element.name) - else: - name = element.name - rv.append("|%s<%s>" % (' ' * indent, name)) - if element.attrs: - attributes = [] - for name, value in list(element.attrs.items()): - if isinstance(name, NamespacedAttribute): - name = "%s %s" % (prefixes[name.namespace], name.name) - if isinstance(value, list): - value = " ".join(value) - attributes.append((name, value)) - - for name, value in sorted(attributes): - rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) - indent += 2 - for child in element.children: - serializeElement(child, indent) - serializeElement(element, 0) - - return "\n".join(rv) - -class AttrList(object): - def __init__(self, element): - self.element = element - self.attrs = dict(self.element.attrs) - def __iter__(self): - return list(self.attrs.items()).__iter__() - def __setitem__(self, name, value): - # If this attribute is a multi-valued attribute for this element, - # turn its value into a list. 
- list_attr = self.element.cdata_list_attributes - if (name in list_attr['*'] - or (self.element.name in list_attr - and name in list_attr[self.element.name])): - # A node that is being cloned may have already undergone - # this procedure. - if not isinstance(value, list): - value = nonwhitespace_re.findall(value) - self.element[name] = value - def items(self): - return list(self.attrs.items()) - def keys(self): - return list(self.attrs.keys()) - def __len__(self): - return len(self.attrs) - def __getitem__(self, name): - return self.attrs[name] - def __contains__(self, name): - return name in list(self.attrs.keys()) - - -class Element(treebuilder_base.Node): - def __init__(self, element, soup, namespace): - treebuilder_base.Node.__init__(self, element.name) - self.element = element - self.soup = soup - self.namespace = namespace - - def appendChild(self, node): - string_child = child = None - if isinstance(node, str): - # Some other piece of code decided to pass in a string - # instead of creating a TextElement object to contain the - # string. - string_child = child = node - elif isinstance(node, Tag): - # Some other piece of code decided to pass in a Tag - # instead of creating an Element object to contain the - # Tag. - child = node - elif node.element.__class__ == NavigableString: - string_child = child = node.element - node.parent = self - else: - child = node.element - node.parent = self - - if not isinstance(child, str) and child.parent is not None: - node.element.extract() - - if (string_child is not None and self.element.contents - and self.element.contents[-1].__class__ == NavigableString): - # We are appending a string onto another string. - # TODO This has O(n^2) performance, for input like - # "aaa..." 
- old_element = self.element.contents[-1] - new_element = self.soup.new_string(old_element + string_child) - old_element.replace_with(new_element) - self.soup._most_recent_element = new_element - else: - if isinstance(node, str): - # Create a brand new NavigableString from this string. - child = self.soup.new_string(node) - - # Tell Beautiful Soup to act as if it parsed this element - # immediately after the parent's last descendant. (Or - # immediately after the parent, if it has no children.) - if self.element.contents: - most_recent_element = self.element._last_descendant(False) - elif self.element.next_element is not None: - # Something from further ahead in the parse tree is - # being inserted into this earlier element. This is - # very annoying because it means an expensive search - # for the last element in the tree. - most_recent_element = self.soup._last_descendant() - else: - most_recent_element = self.element - - self.soup.object_was_parsed( - child, parent=self.element, - most_recent_element=most_recent_element) - - def getAttributes(self): - if isinstance(self.element, Comment): - return {} - return AttrList(self.element) - - def setAttributes(self, attributes): - if attributes is not None and len(attributes) > 0: - converted_attributes = [] - for name, value in list(attributes.items()): - if isinstance(name, tuple): - new_name = NamespacedAttribute(*name) - del attributes[name] - attributes[new_name] = value - - self.soup.builder._replace_cdata_list_attribute_values( - self.name, attributes) - for name, value in list(attributes.items()): - self.element[name] = value - - # The attributes may contain variables that need substitution. - # Call set_up_substitutions manually. - # - # The Tag constructor called this method when the Tag was created, - # but we just set/changed the attributes, so call it again. 
- self.soup.builder.set_up_substitutions(self.element) - attributes = property(getAttributes, setAttributes) - - def insertText(self, data, insertBefore=None): - text = TextNode(self.soup.new_string(data), self.soup) - if insertBefore: - self.insertBefore(text, insertBefore) - else: - self.appendChild(text) - - def insertBefore(self, node, refNode): - index = self.element.index(refNode.element) - if (node.element.__class__ == NavigableString and self.element.contents - and self.element.contents[index-1].__class__ == NavigableString): - # (See comments in appendChild) - old_node = self.element.contents[index-1] - new_str = self.soup.new_string(old_node + node.element) - old_node.replace_with(new_str) - else: - self.element.insert(index, node.element) - node.parent = self - - def removeChild(self, node): - node.element.extract() - - def reparentChildren(self, new_parent): - """Move all of this tag's children into another tag.""" - # print("MOVE", self.element.contents) - # print("FROM", self.element) - # print("TO", new_parent.element) - - element = self.element - new_parent_element = new_parent.element - # Determine what this tag's next_element will be once all the children - # are removed. - final_next_element = element.next_sibling - - new_parents_last_descendant = new_parent_element._last_descendant(False, False) - if len(new_parent_element.contents) > 0: - # The new parent already contains children. We will be - # appending this tag's children to the end. - new_parents_last_child = new_parent_element.contents[-1] - new_parents_last_descendant_next_element = new_parents_last_descendant.next_element - else: - # The new parent contains no children. 
- new_parents_last_child = None - new_parents_last_descendant_next_element = new_parent_element.next_element - - to_append = element.contents - if len(to_append) > 0: - # Set the first child's previous_element and previous_sibling - # to elements within the new parent - first_child = to_append[0] - if new_parents_last_descendant is not None: - first_child.previous_element = new_parents_last_descendant - else: - first_child.previous_element = new_parent_element - first_child.previous_sibling = new_parents_last_child - if new_parents_last_descendant is not None: - new_parents_last_descendant.next_element = first_child - else: - new_parent_element.next_element = first_child - if new_parents_last_child is not None: - new_parents_last_child.next_sibling = first_child - - # Find the very last element being moved. It is now the - # parent's last descendant. It has no .next_sibling and - # its .next_element is whatever the previous last - # descendant had. - last_childs_last_descendant = to_append[-1]._last_descendant(False, True) - - last_childs_last_descendant.next_element = new_parents_last_descendant_next_element - if new_parents_last_descendant_next_element is not None: - # TODO: This code has no test coverage and I'm not sure - # how to get html5lib to go through this path, but it's - # just the other side of the previous line. - new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant - last_childs_last_descendant.next_sibling = None - - for child in to_append: - child.parent = new_parent_element - new_parent_element.contents.append(child) - - # Now that this element has no children, change its .next_element. 
- element.contents = [] - element.next_element = final_next_element - - # print("DONE WITH MOVE") - # print("FROM", self.element) - # print("TO", new_parent_element) - - def cloneNode(self): - tag = self.soup.new_tag(self.element.name, self.namespace) - node = Element(tag, self.soup, self.namespace) - for key,value in self.attributes: - node.attributes[key] = value - return node - - def hasContent(self): - return self.element.contents - - def getNameTuple(self): - if self.namespace == None: - return namespaces["html"], self.name - else: - return self.namespace, self.name - - nameTuple = property(getNameTuple) - -class TextNode(Element): - def __init__(self, element, soup): - treebuilder_base.Node.__init__(self, None) - self.element = element - self.soup = soup - - def cloneNode(self): - raise NotImplementedError diff -Nru beautifulsoup4-4.10.0/bs4/builder/_htmlparser.py beautifulsoup4-1.7.1-benchmark/bs4/builder/_htmlparser.py --- beautifulsoup4-4.10.0/bs4/builder/_htmlparser.py 2021-09-07 23:36:46.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bs4/builder/_htmlparser.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,492 +0,0 @@ -# encoding: utf-8 -"""Use the HTMLParser library to parse HTML files that aren't too bad.""" - -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -__all__ = [ - 'HTMLParserTreeBuilder', - ] - -from html.parser import HTMLParser - -try: - from html.parser import HTMLParseError -except ImportError as e: - # HTMLParseError is removed in Python 3.5. Since it can never be - # thrown in 3.5, we can just define our own class as a placeholder. - class HTMLParseError(Exception): - pass - -import sys -import warnings - -# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' -# argument, which we'd like to set to False. Unfortunately, -# http://bugs.python.org/issue13273 makes strict=True a better bet -# before Python 3.2.3. 
-# -# At the end of this file, we monkeypatch HTMLParser so that -# strict=True works well on Python 3.2.2. -major, minor, release = sys.version_info[:3] -CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 -CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 -CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 - - -from bs4.element import ( - CData, - Comment, - Declaration, - Doctype, - ProcessingInstruction, - ) -from bs4.dammit import EntitySubstitution, UnicodeDammit - -from bs4.builder import ( - HTML, - HTMLTreeBuilder, - STRICT, - ) - - -HTMLPARSER = 'html.parser' - -class BeautifulSoupHTMLParser(HTMLParser): - """A subclass of the Python standard library's HTMLParser class, which - listens for HTMLParser events and translates them into calls - to Beautiful Soup's tree construction API. - """ - - # Strategies for handling duplicate attributes - IGNORE = 'ignore' - REPLACE = 'replace' - - def __init__(self, *args, **kwargs): - """Constructor. - - :param on_duplicate_attribute: A strategy for what to do if a - tag includes the same attribute more than once. Accepted - values are: REPLACE (replace earlier values with later - ones, the default), IGNORE (keep the earliest value - encountered), or a callable. A callable must take three - arguments: the dictionary of attributes already processed, - the name of the duplicate attribute, and the most recent value - encountered. - """ - self.on_duplicate_attribute = kwargs.pop( - 'on_duplicate_attribute', self.REPLACE - ) - HTMLParser.__init__(self, *args, **kwargs) - - # Keep a list of empty-element tags that were encountered - # without an explicit closing tag. If we encounter a closing tag - # of this type, we'll associate it with one of those entries. - # - # This isn't a stack because we don't care about the - # order. It's a list of closing tags we've already handled and - # will ignore, assuming they ever show up. 
- self.already_closed_empty_element = [] - - def error(self, msg): - """In Python 3, HTMLParser subclasses must implement error(), although - this requirement doesn't appear to be documented. - - In Python 2, HTMLParser implements error() by raising an exception, - which we don't want to do. - - In any event, this method is called only on very strange - markup and our best strategy is to pretend it didn't happen - and keep going. - """ - warnings.warn(msg) - - def handle_startendtag(self, name, attrs): - """Handle an incoming empty-element tag. - - This is only called when the markup looks like . - - :param name: Name of the tag. - :param attrs: Dictionary of the tag's attributes. - """ - # is_startend() tells handle_starttag not to close the tag - # just because its name matches a known empty-element tag. We - # know that this is an empty-element tag and we want to call - # handle_endtag ourselves. - tag = self.handle_starttag(name, attrs, handle_empty_element=False) - self.handle_endtag(name) - - def handle_starttag(self, name, attrs, handle_empty_element=True): - """Handle an opening tag, e.g. '' - - :param name: Name of the tag. - :param attrs: Dictionary of the tag's attributes. - :param handle_empty_element: True if this tag is known to be - an empty-element tag (i.e. there is not expected to be any - closing tag). - """ - # XXX namespace - attr_dict = {} - for key, value in attrs: - # Change None attribute values to the empty string - # for consistency with the other tree builders. - if value is None: - value = '' - if key in attr_dict: - # A single attribute shows up multiple times in this - # tag. How to handle it depends on the - # on_duplicate_attribute setting. 
- on_dupe = self.on_duplicate_attribute - if on_dupe == self.IGNORE: - pass - elif on_dupe in (None, self.REPLACE): - attr_dict[key] = value - else: - on_dupe(attr_dict, key, value) - else: - attr_dict[key] = value - attrvalue = '""' - #print("START", name) - sourceline, sourcepos = self.getpos() - tag = self.soup.handle_starttag( - name, None, None, attr_dict, sourceline=sourceline, - sourcepos=sourcepos - ) - if tag and tag.is_empty_element and handle_empty_element: - # Unlike other parsers, html.parser doesn't send separate end tag - # events for empty-element tags. (It's handled in - # handle_startendtag, but only if the original markup looked like - # .) - # - # So we need to call handle_endtag() ourselves. Since we - # know the start event is identical to the end event, we - # don't want handle_endtag() to cross off any previous end - # events for tags of this name. - self.handle_endtag(name, check_already_closed=False) - - # But we might encounter an explicit closing tag for this tag - # later on. If so, we want to ignore it. - self.already_closed_empty_element.append(name) - - def handle_endtag(self, name, check_already_closed=True): - """Handle a closing tag, e.g. '' - - :param name: A tag name. - :param check_already_closed: True if this tag is expected to - be the closing portion of an empty-element tag, - e.g. ''. - """ - #print("END", name) - if check_already_closed and name in self.already_closed_empty_element: - # This is a redundant end tag for an empty-element tag. - # We've already called handle_endtag() for it, so just - # check it off the list. 
- #print("ALREADY CLOSED", name) - self.already_closed_empty_element.remove(name) - else: - self.soup.handle_endtag(name) - - def handle_data(self, data): - """Handle some textual data that shows up between tags.""" - self.soup.handle_data(data) - - def handle_charref(self, name): - """Handle a numeric character reference by converting it to the - corresponding Unicode character and treating it as textual - data. - - :param name: Character number, possibly in hexadecimal. - """ - # XXX workaround for a bug in HTMLParser. Remove this once - # it's fixed in all supported versions. - # http://bugs.python.org/issue13633 - if name.startswith('x'): - real_name = int(name.lstrip('x'), 16) - elif name.startswith('X'): - real_name = int(name.lstrip('X'), 16) - else: - real_name = int(name) - - data = None - if real_name < 256: - # HTML numeric entities are supposed to reference Unicode - # code points, but sometimes they reference code points in - # some other encoding (ahem, Windows-1252). E.g. “ - # instead of É for LEFT DOUBLE QUOTATION MARK. This - # code tries to detect this situation and compensate. - for encoding in (self.soup.original_encoding, 'windows-1252'): - if not encoding: - continue - try: - data = bytearray([real_name]).decode(encoding) - except UnicodeDecodeError as e: - pass - if not data: - try: - data = chr(real_name) - except (ValueError, OverflowError) as e: - pass - data = data or "\N{REPLACEMENT CHARACTER}" - self.handle_data(data) - - def handle_entityref(self, name): - """Handle a named entity reference by converting it to the - corresponding Unicode character(s) and treating it as textual - data. - - :param name: Name of the entity reference. - """ - character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) - if character is not None: - data = character - else: - # If this were XML, it would be ambiguous whether "&foo" - # was an character entity reference with a missing - # semicolon or the literal string "&foo". 
Since this is - # HTML, we have a complete list of all character entity references, - # and this one wasn't found, so assume it's the literal string "&foo". - data = "&%s" % name - self.handle_data(data) - - def handle_comment(self, data): - """Handle an HTML comment. - - :param data: The text of the comment. - """ - self.soup.endData() - self.soup.handle_data(data) - self.soup.endData(Comment) - - def handle_decl(self, data): - """Handle a DOCTYPE declaration. - - :param data: The text of the declaration. - """ - self.soup.endData() - data = data[len("DOCTYPE "):] - self.soup.handle_data(data) - self.soup.endData(Doctype) - - def unknown_decl(self, data): - """Handle a declaration of unknown type -- probably a CDATA block. - - :param data: The text of the declaration. - """ - if data.upper().startswith('CDATA['): - cls = CData - data = data[len('CDATA['):] - else: - cls = Declaration - self.soup.endData() - self.soup.handle_data(data) - self.soup.endData(cls) - - def handle_pi(self, data): - """Handle a processing instruction. - - :param data: The text of the instruction. - """ - self.soup.endData() - self.soup.handle_data(data) - self.soup.endData(ProcessingInstruction) - - -class HTMLParserTreeBuilder(HTMLTreeBuilder): - """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser, - found in the Python standard library. - """ - is_xml = False - picklable = True - NAME = HTMLPARSER - features = [NAME, HTML, STRICT] - - # The html.parser knows which line number and position in the - # original file is the source of an element. - TRACKS_LINE_NUMBERS = True - - def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): - """Constructor. - - :param parser_args: Positional arguments to pass into - the BeautifulSoupHTMLParser constructor, once it's - invoked. - :param parser_kwargs: Keyword arguments to pass into - the BeautifulSoupHTMLParser constructor, once it's - invoked. - :param kwargs: Keyword arguments for the superclass constructor. 
- """ - # Some keyword arguments will be pulled out of kwargs and placed - # into parser_kwargs. - extra_parser_kwargs = dict() - for arg in ('on_duplicate_attribute',): - if arg in kwargs: - value = kwargs.pop(arg) - extra_parser_kwargs[arg] = value - super(HTMLParserTreeBuilder, self).__init__(**kwargs) - parser_args = parser_args or [] - parser_kwargs = parser_kwargs or {} - parser_kwargs.update(extra_parser_kwargs) - if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: - parser_kwargs['strict'] = False - if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: - parser_kwargs['convert_charrefs'] = False - self.parser_args = (parser_args, parser_kwargs) - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None, exclude_encodings=None): - - """Run any preliminary steps necessary to make incoming markup - acceptable to the parser. - - :param markup: Some markup -- probably a bytestring. - :param user_specified_encoding: The user asked to try this encoding. - :param document_declared_encoding: The markup itself claims to be - in this encoding. - :param exclude_encodings: The user asked _not_ to try any of - these encodings. - - :yield: A series of 4-tuples: - (markup, encoding, declared encoding, - has undergone character replacement) - - Each 4-tuple represents a strategy for converting the - document to Unicode and parsing it. Each strategy will be tried - in turn. - """ - if isinstance(markup, str): - # Parse Unicode as-is. - yield (markup, None, None, False) - return - - # Ask UnicodeDammit to sniff the most likely encoding. - - # This was provided by the end-user; treat it as a known - # definite encoding per the algorithm laid out in the HTML5 - # spec. (See the EncodingDetector class for details.) - known_definite_encodings = [user_specified_encoding] - - # This was found in the document; treat it as a slightly lower-priority - # user encoding. 
- user_encodings = [document_declared_encoding] - - try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit( - markup, - known_definite_encodings=known_definite_encodings, - user_encodings=user_encodings, - is_html=True, - exclude_encodings=exclude_encodings - ) - yield (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) - - def feed(self, markup): - """Run some incoming markup through some parsing process, - populating the `BeautifulSoup` object in self.soup. - """ - args, kwargs = self.parser_args - parser = BeautifulSoupHTMLParser(*args, **kwargs) - parser.soup = self.soup - try: - parser.feed(markup) - parser.close() - except HTMLParseError as e: - warnings.warn(RuntimeWarning( - "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) - raise e - parser.already_closed_empty_element = [] - -# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some -# 3.2.3 code. This ensures they don't treat markup like
as a -# string. -# -# XXX This code can be removed once most Python 3 users are on 3.2.3. -if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: - import re - attrfind_tolerant = re.compile( - r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') - HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant - - locatestarttagend = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value - ) - )? - ) - )* - \s* # trailing whitespace -""", re.VERBOSE) - BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend - - from html.parser import tagfind, attrfind - - def parse_starttag(self, i): - self.__starttag_text = None - endpos = self.check_for_whole_start_tag(i) - if endpos < 0: - return endpos - rawdata = self.rawdata - self.__starttag_text = rawdata[i:endpos] - - # Now parse the data between i+1 and j into a tag and attrs - attrs = [] - match = tagfind.match(rawdata, i+1) - assert match, 'unexpected call to parse_starttag()' - k = match.end() - self.lasttag = tag = rawdata[i+1:k].lower() - while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) - if not m: - break - attrname, rest, attrvalue = m.group(1, 2, 3) - if not rest: - attrvalue = None - elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ - attrvalue[:1] == '"' == attrvalue[-1:]: - attrvalue = attrvalue[1:-1] - if attrvalue: - attrvalue = self.unescape(attrvalue) - attrs.append((attrname.lower(), attrvalue)) - k = m.end() - - end = rawdata[k:endpos].strip() - if end not in (">", "/>"): - lineno, offset = self.getpos() - if "\n" in self.__starttag_text: - lineno = lineno + self.__starttag_text.count("\n") - offset = len(self.__starttag_text) \ - - self.__starttag_text.rfind("\n") - 
else: - offset = offset + len(self.__starttag_text) - if self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) - self.handle_data(rawdata[i:endpos]) - return endpos - if end.endswith('/>'): - # XHTML-style empty tag: - self.handle_startendtag(tag, attrs) - else: - self.handle_starttag(tag, attrs) - if tag in self.CDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag) - return endpos - - def set_cdata_mode(self, elem): - self.cdata_elem = elem.lower() - self.interesting = re.compile(r'' % self.cdata_elem, re.I) - - BeautifulSoupHTMLParser.parse_starttag = parse_starttag - BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode - - CONSTRUCTOR_TAKES_STRICT = True diff -Nru beautifulsoup4-4.10.0/bs4/builder/__init__.py beautifulsoup4-1.7.1-benchmark/bs4/builder/__init__.py --- beautifulsoup4-4.10.0/bs4/builder/__init__.py 2021-09-07 23:36:46.000000000 +0000 +++ beautifulsoup4-1.7.1-benchmark/bs4/builder/__init__.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,520 +0,0 @@ -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -from collections import defaultdict -import itertools -import sys -from bs4.element import ( - CharsetMetaAttributeValue, - ContentMetaAttributeValue, - Stylesheet, - Script, - TemplateString, - nonwhitespace_re -) - -__all__ = [ - 'HTMLTreeBuilder', - 'SAXTreeBuilder', - 'TreeBuilder', - 'TreeBuilderRegistry', - ] - -# Some useful features for a TreeBuilder to have. -FAST = 'fast' -PERMISSIVE = 'permissive' -STRICT = 'strict' -XML = 'xml' -HTML = 'html' -HTML_5 = 'html5' - - -class TreeBuilderRegistry(object): - """A way of looking up TreeBuilder subclasses by their name or by desired - features. - """ - - def __init__(self): - self.builders_for_feature = defaultdict(list) - self.builders = [] - - def register(self, treebuilder_class): - """Register a treebuilder based on its advertised features. - - :param treebuilder_class: A subclass of Treebuilder. 
its .features - attribute should list its features. - """ - for feature in treebuilder_class.features: - self.builders_for_feature[feature].insert(0, treebuilder_class) - self.builders.insert(0, treebuilder_class) - - def lookup(self, *features): - """Look up a TreeBuilder subclass with the desired features. - - :param features: A list of features to look for. If none are - provided, the most recently registered TreeBuilder subclass - will be used. - :return: A TreeBuilder subclass, or None if there's no - registered subclass with all the requested features. - """ - if len(self.builders) == 0: - # There are no builders at all. - return None - - if len(features) == 0: - # They didn't ask for any features. Give them the most - # recently registered builder. - return self.builders[0] - - # Go down the list of features in order, and eliminate any builders - # that don't match every feature. - features = list(features) - features.reverse() - candidates = None - candidate_set = None - while len(features) > 0: - feature = features.pop() - we_have_the_feature = self.builders_for_feature.get(feature, []) - if len(we_have_the_feature) > 0: - if candidates is None: - candidates = we_have_the_feature - candidate_set = set(candidates) - else: - # Eliminate any candidates that don't have this feature. - candidate_set = candidate_set.intersection( - set(we_have_the_feature)) - - # The only valid candidates are the ones in candidate_set. - # Go through the original list of candidates and pick the first one - # that's in candidate_set. - if candidate_set is None: - return None - for candidate in candidates: - if candidate in candidate_set: - return candidate - return None - -# The BeautifulSoup class will take feature lists from developers and use them -# to look up builders in this registry. 
-builder_registry = TreeBuilderRegistry() - -class TreeBuilder(object): - """Turn a textual document into a Beautiful Soup object tree.""" - - NAME = "[Unknown tree builder]" - ALTERNATE_NAMES = [] - features = [] - - is_xml = False - picklable = False - empty_element_tags = None # A tag will be considered an empty-element - # tag when and only when it has no contents. - - # A value for these tag/attribute combinations is a space- or - # comma-separated list of CDATA, rather than a single CDATA. - DEFAULT_CDATA_LIST_ATTRIBUTES = {} - - # Whitespace should be preserved inside these tags. - DEFAULT_PRESERVE_WHITESPACE_TAGS = set() - - # The textual contents of tags with these names should be - # instantiated with some class other than NavigableString. - DEFAULT_STRING_CONTAINERS = {} - - USE_DEFAULT = object() - - # Most parsers don't keep track of line numbers. - TRACKS_LINE_NUMBERS = False - - def __init__(self, multi_valued_attributes=USE_DEFAULT, - preserve_whitespace_tags=USE_DEFAULT, - store_line_numbers=USE_DEFAULT, - string_containers=USE_DEFAULT, - ): - """Constructor. - - :param multi_valued_attributes: If this is set to None, the - TreeBuilder will not turn any values for attributes like - 'class' into lists. Setting this to a dictionary will - customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES - for an example. - - Internally, these are called "CDATA list attributes", but that - probably doesn't make sense to an end-user, so the argument name - is `multi_valued_attributes`. - - :param preserve_whitespace_tags: A list of tags to treat - the way
tags are treated in HTML. Tags in this list - are immune from pretty-printing; their contents will always be - output as-is. - - :param string_containers: A dictionary mapping tag names to - the classes that should be instantiated to contain the textual - contents of those tags. The default is to use NavigableString - for every tag, no matter what the name. You can override the - default by changing DEFAULT_STRING_CONTAINERS. - - :param store_line_numbers: If the parser keeps track of the - line numbers and positions of the original markup, that - information will, by default, be stored in each corresponding - `Tag` object. You can turn this off by passing - store_line_numbers=False. If the parser you're using doesn't - keep track of this information, then setting store_line_numbers=True - will do nothing. - """ - self.soup = None - if multi_valued_attributes is self.USE_DEFAULT: - multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES - self.cdata_list_attributes = multi_valued_attributes - if preserve_whitespace_tags is self.USE_DEFAULT: - preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS - self.preserve_whitespace_tags = preserve_whitespace_tags - if store_line_numbers == self.USE_DEFAULT: - store_line_numbers = self.TRACKS_LINE_NUMBERS - self.store_line_numbers = store_line_numbers - if string_containers == self.USE_DEFAULT: - string_containers = self.DEFAULT_STRING_CONTAINERS - self.string_containers = string_containers - - def initialize_soup(self, soup): - """The BeautifulSoup object has been initialized and is now - being associated with the TreeBuilder. - - :param soup: A BeautifulSoup object. - """ - self.soup = soup - - def reset(self): - """Do any work necessary to reset the underlying parser - for a new document. - - By default, this does nothing. - """ - pass - - def can_be_empty_element(self, tag_name): - """Might a tag with this name be an empty-element tag? 
- - The final markup may or may not actually present this tag as - self-closing. - - For instance: an HTMLBuilder does not consider a
tag to be - an empty-element tag (it's not in - HTMLBuilder.empty_element_tags). This means an empty
tag - will be presented as "
", not "
" or "
". - - The default implementation has no opinion about which tags are - empty-element tags, so a tag will be presented as an - empty-element tag if and only if it has no children. - "" will become "", and "bar" will - be left alone. - - :param tag_name: The name of a markup tag. - """ - if self.empty_element_tags is None: - return True - return tag_name in self.empty_element_tags - - def feed(self, markup): - """Run some incoming markup through some parsing process, - populating the `BeautifulSoup` object in self.soup. - - This method is not implemented in TreeBuilder; it must be - implemented in subclasses. - - :return: None. - """ - raise NotImplementedError() - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None, exclude_encodings=None): - """Run any preliminary steps necessary to make incoming markup - acceptable to the parser. - - :param markup: Some markup -- probably a bytestring. - :param user_specified_encoding: The user asked to try this encoding. - :param document_declared_encoding: The markup itself claims to be - in this encoding. NOTE: This argument is not used by the - calling code and can probably be removed. - :param exclude_encodings: The user asked _not_ to try any of - these encodings. - - :yield: A series of 4-tuples: - (markup, encoding, declared encoding, - has undergone character replacement) - - Each 4-tuple represents a strategy for converting the - document to Unicode and parsing it. Each strategy will be tried - in turn. - - By default, the only strategy is to parse the markup - as-is. See `LXMLTreeBuilderForXML` and - `HTMLParserTreeBuilder` for implementations that take into - account the quirks of particular parsers. - """ - yield markup, None, None, False - - def test_fragment_to_document(self, fragment): - """Wrap an HTML fragment to make it look like a document. - - Different parsers do this differently. For instance, lxml - introduces an empty tag, and html5lib - doesn't. 
Abstracting this away lets us write simple tests - which run HTML fragments through the parser and compare the - results against other HTML fragments. - - This method should not be used outside of tests. - - :param fragment: A string -- fragment of HTML. - :return: A string -- a full HTML document. - """ - return fragment - - def set_up_substitutions(self, tag): - """Set up any substitutions that will need to be performed on - a `Tag` when it's output as a string. - - By default, this does nothing. See `HTMLTreeBuilder` for a - case where this is used. - - :param tag: A `Tag` - :return: Whether or not a substitution was performed. - """ - return False - - def _replace_cdata_list_attribute_values(self, tag_name, attrs): - """When an attribute value is associated with a tag that can - have multiple values for that attribute, convert the string - value to a list of strings. - - Basically, replaces class="foo bar" with class=["foo", "bar"] - - NOTE: This method modifies its input in place. - - :param tag_name: The name of a tag. - :param attrs: A dictionary containing the tag's attributes. - Any appropriate attribute values will be modified in place. - """ - if not attrs: - return attrs - if self.cdata_list_attributes: - universal = self.cdata_list_attributes.get('*', []) - tag_specific = self.cdata_list_attributes.get( - tag_name.lower(), None) - for attr in list(attrs.keys()): - if attr in universal or (tag_specific and attr in tag_specific): - # We have a "class"-type attribute whose string - # value is a whitespace-separated list of - # values. Split it into a list. - value = attrs[attr] - if isinstance(value, str): - values = nonwhitespace_re.findall(value) - else: - # html5lib sometimes calls setAttributes twice - # for the same tag when rearranging the parse - # tree. On the second call the attribute value - # here is already a list. If this happens, - # leave the value alone rather than trying to - # split it again. 
- values = value - attrs[attr] = values - return attrs - -class SAXTreeBuilder(TreeBuilder): - """A Beautiful Soup treebuilder that listens for SAX events. - - This is not currently used for anything, but it demonstrates - how a simple TreeBuilder would work. - """ - - def feed(self, markup): - raise NotImplementedError() - - def close(self): - pass - - def startElement(self, name, attrs): - attrs = dict((key[1], value) for key, value in list(attrs.items())) - #print("Start %s, %r" % (name, attrs)) - self.soup.handle_starttag(name, attrs) - - def endElement(self, name): - #print("End %s" % name) - self.soup.handle_endtag(name) - - def startElementNS(self, nsTuple, nodeName, attrs): - # Throw away (ns, nodeName) for now. - self.startElement(nodeName, attrs) - - def endElementNS(self, nsTuple, nodeName): - # Throw away (ns, nodeName) for now. - self.endElement(nodeName) - #handler.endElementNS((ns, node.nodeName), node.nodeName) - - def startPrefixMapping(self, prefix, nodeValue): - # Ignore the prefix for now. - pass - - def endPrefixMapping(self, prefix): - # Ignore the prefix for now. - # handler.endPrefixMapping(prefix) - pass - - def characters(self, content): - self.soup.handle_data(content) - - def startDocument(self): - pass - - def endDocument(self): - pass - - -class HTMLTreeBuilder(TreeBuilder): - """This TreeBuilder knows facts about HTML. - - Such as which tags are empty-element tags. - """ - - empty_element_tags = set([ - # These are from HTML5. - 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', - - # These are from earlier versions of HTML and are removed in HTML5. - 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' - ]) - - # The HTML standard defines these as block-level elements. 
Beautiful - # Soup does not treat these elements differently from other elements, - # but it may do so eventually, and this information is available if - # you need to use it. - block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) - - # The HTML standard defines an unusual content model for these tags. - # We represent this by using a string class other than NavigableString - # inside these tags. - # - # I made this list by going through the HTML spec - # (https://html.spec.whatwg.org/#metadata-content) and looking for - # "metadata content" elements that can contain strings. - # - # TODO: Arguably aftermath
' - soup = self.soup(markup) - noscript = soup.noscript - self.assertEqual("target", noscript.next_element) - target = soup.find(string='target') - - # The 'aftermath' string was duplicated; we want the second one. - final_aftermath = soup.find_all(string='aftermath')[-1] - - # The