Patch summary (free text ahead of the first "diff --git" header; patch tools skip it):
  * .github/workflows/publish-to-test-pypi.yml: new workflow that runs on every
    push with the jobs test -> build -> publish-to-testpypi, plus publish-to-pypi,
    which is gated on tag refs by its "if:" condition.
  * README.md: retitle "Example" to "Example (vanilla)" and add a pypdf example.
  * pyproject.toml: bump version 0.0.3 -> 1.0.0.
Line structure and UTF-8 text (package emoji, accented author name) are restored;
hunk line counts match the "@@" headers.

diff --git a/.github/workflows/publish-to-test-pypi.yml b/.github/workflows/publish-to-test-pypi.yml
new file mode 100644
index 0000000..fb134ac
--- /dev/null
+++ b/.github/workflows/publish-to-test-pypi.yml
@@ -0,0 +1,95 @@
+name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
+
+on:
+  push
+
+jobs:
+  test:
+    name: Run tests
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+      - name: Run tests
+        env:
+          PYTHONPATH: src
+        run: python3 -m unittest discover tests/
+
+  build:
+    name: Build distribution 📦
+    runs-on: ubuntu-latest
+    needs:
+      - test
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+      - name: Install pypa/build
+        run: >-
+          python3 -m
+          pip install
+          build
+          --user
+      - name: Build a binary wheel and a source tarball
+        run: python3 -m build
+      - name: Store the distribution packages
+        uses: actions/upload-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+
+  publish-to-pypi:
+    name: >-
+      Publish Python 🐍 distribution 📦 to PyPI
+    if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes
+    needs:
+      - build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/refinedoc
+    permissions:
+      id-token: write  # IMPORTANT: mandatory for trusted publishing
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+      - name: Publish distribution 📦 to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+
+  publish-to-testpypi:
+    name: Publish Python 🐍 distribution 📦 to TestPyPI
+    needs:
+      - build
+    runs-on: ubuntu-latest
+
+    environment:
+      name: testpypi
+      url: https://test.pypi.org/p/refinedoc
+
+    permissions:
+      id-token: write  # IMPORTANT: mandatory for trusted publishing
+
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+      - name: Publish distribution 📦 to TestPyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          repository-url: https://test.pypi.org/legacy/
\ No newline at end of file
diff --git a/README.md b/README.md
index 4f358cf..a7d7c66 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,8 @@ You can install with pip
 ```
 pip install refinedoc
 ```
-### Example
+### Example (vanilla)
+
 ```python
 from refinedoc.refined_document import RefinedDocument
 
@@ -61,6 +62,27 @@ body = rd.body
 # [["lorem ipsum dolor sit amet", "consectetur adipiscing elit"], ["sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"], ["ut enim ad minim veniam quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat"], ["duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur"]]
 ```
 
+## Example (with pypdf)
+
+```python
+from refinedoc.refined_document import RefinedDocument
+from pypdf import PdfReader
+
+# Build the document from a PDF file
+reader = PdfReader("path/to/your/pdf/file.pdf")
+document = []
+for page in reader.pages:
+    document.append(page.extract_text().split("\n"))
+
+rd = RefinedDocument(content=document)
+headers = rd.headers
+# [["header 1", "subheader 1"], ["header 2", "subheader 2"], ["header 3", "subheader 3"], ["header 4", "subheader 4"]]
+footers = rd.footers
+# [["footer 1"], ["footer 2"], ["footer 3"], ["footer 4"]]
+body = rd.body
+# [["lorem ipsum dolor sit amet", "consectetur adipiscing elit"], ["sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"], ["ut enim ad minim veniam quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat"], ["duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur"]]
+```
+
 ## How it's work
 
 My work is based on this paper : [Lin, Xiaofan. (2003). Header and Footer Extraction by Page-Association. 5010. 164-171. 10.1117/12.472833. ](https://www.researchgate.net/publication/221253782_Header_and_Footer_Extraction_by_Page-Association)
diff --git a/pyproject.toml b/pyproject.toml
index 559f64b..210fba6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "refinedoc"
-version = "0.0.3"
+version = "1.0.0"
 authors = [
   { name="Théo NARDIN", email="theo.nardin@learningplanetinstitute.org" },
 ]