Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 29 additions & 7 deletions demos/embed/sic_embedding_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,42 @@
"""

# %%
from industrial_classification_utils.embed import EmbeddingHandler
from importlib.resources import files

EXAMPLE_QUERY = "school teacher primary education"
from industrial_classification_utils.embed import (
EmbeddingHandler,
load_embedding_handler_from_sic_index_files,
)

DB_DIR = "./data/vector_store"

# %%
print("Creating embeddings index...")
print("Creating embeddings example index...")
# Create the embeddings index
embed = EmbeddingHandler(db_dir="./data/vector_store")

example_data = files("industrial_classification_utils.data.example").joinpath(
"toy_index.csv"
)
embed1 = EmbeddingHandler(db_dir=DB_DIR, index_source_file=str(example_data))
print(
f"Embeddings index created with {embed._index_size} entries." # pylint: disable=protected-access
f"Embeddings index created with {embed1.index_size} entries." # pylint: disable=protected-access
)

print("\nExample search for most 'loyal' in the toy index:")
print(embed1.search_index("loyal").model_dump_json(indent=2))


# %%
results = embed.search_index(EXAMPLE_QUERY)
print(f"Search results for '{EXAMPLE_QUERY}': {results}")
# Alternative loading method using large published sic indices (xlsx)
embed2 = load_embedding_handler_from_sic_index_files(db_dir=DB_DIR)
print(
f"Embeddings index created with {embed2.index_size} entries." # pylint: disable=protected-access
)

# %%
EXAMPLE_QUERY = "Primary education"
results = embed2.search_index(EXAMPLE_QUERY)

print(f"Results for query '{EXAMPLE_QUERY}':", results.model_dump_json(indent=2))

# %%
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ convention = "google"
"tests/*" = [
# Allow use of assert statements in tests
"S101",
# Allow magic values used in comparison in tests
"PLR2004",
]

[tool.ruff.format]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
label,text
01,"Crop and animal production, hunting and related service activities"
02,"Forestry and logging"
03,"Fishing and aquaculture"
05,"Mining of coal and lignite"
06,"Extraction of crude petroleum and natural gas"
07,"Mining of metal ores"
08,"Other mining and quarrying"
09,"Mining support service activities"
10,"Manufacture of food products"
11,"Manufacture of beverages"
12,"Manufacture of tobacco products"
13,"Manufacture of textiles"
14,"Manufacture of wearing apparel"
15,"Manufacture of leather and related products"
16,"Manufacture of wood and of products of wood and cork, except furniture; manufacture of articles of straw and plaiting materials"
17,"Manufacture of paper and paper products"
18,"Printing and reproduction of recorded media"
19,"Manufacture of coke and refined petroleum products"
20,"Manufacture of chemicals and chemical products"
21,"Manufacture of basic pharmaceutical products and pharmaceutical preparations"
22,"Manufacture of rubber and plastic products"
23,"Manufacture of other non-metallic mineral products"
24,"Manufacture of basic metals"
25,"Manufacture of fabricated metal products, except machinery and equipment"
26,"Manufacture of computer, electronic and optical products"
27,"Manufacture of electrical equipment"
28,"Manufacture of machinery and equipment nec"
29,"Manufacture of motor vehicles, trailers and semi-trailers"
30,"Manufacture of other transport equipment"
31,"Manufacture of furniture"
32,"Other manufacturing"
33,"Repair and installation of machinery and equipment"
35,"Electricity, gas, steam and air conditioning supply"
36,"Water collection, treatment and supply"
37,"Sewerage"
38,"Waste collection, treatment and disposal activities; materials recovery"
39,"Remediation activities and other waste management services"
41,"Construction of buildings"
42,"Civil engineering"
43,"Specialised construction activities"
45,"Wholesale and retail trade and repair of motor vehicles and motorcycles"
46,"Wholesale trade, except of motor vehicles and motorcycles"
47,"Retail trade, except of motor vehicles and motorcycles"
49,"Land transport and transport via pipelines"
50,"Water transport"
51,"Air transport"
52,"Warehousing and support activities for transportation"
53,"Postal and courier activities"
55,"Accommodation"
56,"Food and beverage service activities"
58,"Publishing activities"
59,"Motion picture, video and television programme production, sound recording and music publishing activities"
60,"Programming and broadcasting activities"
61,"Telecommunications"
62,"Computer programming, consultancy and related activities"
63,"Information service activities"
64,"Financial service activities, except insurance and pension funding"
65,"Insurance, reinsurance and pension funding, except compulsory social security"
66,"Activities auxiliary to financial services and insurance activities"
68,"Real estate activities"
69,"Legal and accounting activities"
70,"Activities of head offices; management consultancy activities"
71,"Architectural and engineering activities; technical testing and analysis"
72,"Scientific research and development"
73,"Advertising and market research"
74,"Other professional, scientific and technical activities"
75,"Veterinary activities"
77,"Rental and leasing activities"
78,"Employment activities"
79,"Travel agency, tour operator and other reservation service and related activities"
80,"Security and investigation activities"
81,"Services to buildings and landscape activities"
82,"Office administrative, office support and other business support activities"
84,"Public administration and defence; compulsory social security"
85,"Education"
86,"Human health activities"
87,"Residential care activities"
88,"Social work activities without accommodation"
90,"Creative, arts and entertainment activities"
91,"Libraries, archives, museums and other cultural activities"
92,"Gambling and betting activities"
93,"Sports activities and amusement and recreation activities"
94,"Activities of membership organisations"
95,"Repair of computers and personal and household goods"
96,"Other personal service activities"
97,"Activities of households as employers of domestic personnel"
98,"Undifferentiated goods- and services-producing activities of private households for own use"
99,"Activities of extraterritorial organisations and bodies"

This file was deleted.

Loading
Loading