In [23]:
%pip install openai scikit-learn

from google.colab import userdata

# Names of the Colab secrets this notebook needs
secret_names = ('openai_api_key',)

# Look each secret up once and keep the values in a plain dict keyed by name
config = {name: userdata.get(name) for name in secret_names}

In [24]:
%pip install openai scikit-learn



Let's brainstorm a list of miscellaneous things:

In [25]:
# The raw list, one entry per line, written as a Markdown-style bullet list
items_text = """
- pasta
- thomas dolby
- alpha
- apples
- cats
- pears
- meters
- brick
- dogs
- beta
- howard jones
- concrete
- asphalt
- milk
- rebar
- gillian gilbert
- hamsters
- bread
- butter
- wendy carlos
- gamma
- birds
- bananas
- rick wakeman
- inches
- glass
- feet
- gary numan
- miles
- lumber
- kilometers
- geoff downes
"""

# Break the text into lines and drop the empty ones; each item keeps its
# leading "- " so it can be printed back out as a bullet later
items = list(filter(None, items_text.splitlines()))

We're going to use OpenAI's services, so let's create a client with our API key:

In [26]:
from openai import OpenAI

# Shared OpenAI API client for the rest of the notebook, authenticated with
# the API key that was copied into `config`
openai_client = OpenAI(api_key=config["openai_api_key"])

Next, let's pick an embedding model and generate semantic vector representations for all our list items:

In [27]:
# Send the whole list in one request; the response carries one entry per input
embeddings_response = openai_client.embeddings.create(
    input=items,
    model="text-embedding-ada-002",
    encoding_format="float",
)

# Keep only the vector from each response entry
embeddings = [entry.embedding for entry in embeddings_response.data]

Now that we have vectors, let's try clustering them within the semantic space of the model. This should be roughly analogous to grouping them by meaning:

In [28]:
from sklearn.cluster import KMeans
from itertools import groupby

# Let's say we want to organize the list into this many clusters
n_clusters = 12

# k-means picks its starting centroids at random, so pin the seed; without it
# the clusters (and their numbering) change every time this cell is run
kmeans_random_state = 42

# zip() below silently stops at the shorter input, so check that every item
# really has an embedding before pairing them up by position
assert len(embeddings) == len(items), "expected exactly one embedding per item"

# Use the k-means algorithm to come up with a cluster ID for each embedding
cluster_ids = KMeans(
    n_clusters=n_clusters,
    n_init='auto',
    random_state=kmeans_random_state,
).fit_predict(embeddings)

# Associate each cluster ID with the corresponding item
cluster_ids_with_items = zip(cluster_ids, items)

# Group the pairs of (cluster_id, item) into lists based on cluster ID.
# groupby only merges *adjacent* equal keys, hence the sort first; sorted() is
# stable, so items keep their original relative order inside each cluster
grouped_cluster_ids_with_items = groupby(
    sorted(cluster_ids_with_items, key=lambda pair: pair[0]),
    key=lambda pair: pair[0]
)

# Drop the cluster IDs so we just have a list of lists of items,
# ordered by ascending cluster ID
clustered_items = [
    [item for _, item in item_group]
    for _, item_group in grouped_cluster_ids_with_items
]

clustered_items

[['- lumber'],
 ['- apples', '- pears', '- milk', '- butter', '- bananas'],
 ['- meters', '- miles', '- kilometers'],
 ['- alpha', '- beta', '- gamma'],
 ['- howard jones', '- geoff downes'],
 ['- brick', '- concrete', '- asphalt', '- rebar', '- glass'],
 ['- pasta', '- bread'],
 ['- cats', '- dogs', '- hamsters', '- birds'],
 ['- thomas dolby'],
 ['- wendy carlos', '- rick wakeman'],
 ['- gillian gilbert', '- gary numan'],
 ['- inches', '- feet']]

It's not perfect, but we've got our list roughly organized. Let's try coming up with a title for each cluster:

In [29]:
# Instructions for the labelling step, written for OpenAI's gpt-3.5-turbo.
# generate_topic() sends this text as the system message; the list of items
# to be labelled is sent separately as the user message.
topic_generation_prompt = """
Please consider a list of items, one item per line.
From that list, produce a single concise label describing the entire list of items as a whole while avoiding the inclusion of the items.
The label should consist of fewer than 7 words.
Do not offer conversational preamble.
Do not explain the result.
Do not include any extraneous formatting or punctuation.
Thank you!
"""

def generate_topic(items, model="gpt-3.5-turbo"):
    """Ask an OpenAI chat model for a short label describing a list of items.

    Args:
        items: Iterable of strings. They are joined one per line and sent as
            the user message, with `topic_generation_prompt` as the system
            message.
        model: Name of the chat model to query. Defaults to "gpt-3.5-turbo",
            the model the prompt was written for.

    Returns:
        The text of the first completion choice with surrounding whitespace
        removed, or an empty string if the response carried no text.
    """
    text = "\n".join(items)
    completion = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": topic_generation_prompt},
            {"role": "user", "content": text},
        ],
    )
    # The message content is optional in the API response and may come back
    # with stray leading/trailing whitespace; normalise both cases so callers
    # always get a clean string to print.
    label = completion.choices[0].message.content
    return (label or "").strip()

# Label every cluster and print it as a small Markdown-style section:
# a "# <label>" heading, a blank line, the items, then a trailing blank line
for cluster in clustered_items:
    topic = generate_topic(cluster)
    section_lines = [f"# {topic}", "", *cluster, ""]
    print("\n".join(section_lines))

# Building materials

- lumber

# Groceries

- apples
- pears
- milk
- butter
- bananas

# Units of Length

- meters
- miles
- kilometers

# Greek letters

- alpha
- beta
- gamma

# Synthpop musicians

- howard jones
- geoff downes

# Building materials

- brick
- concrete
- asphalt
- rebar
- glass

# Carbohydrate-rich foods

- pasta
- bread

# Household pets

- cats
- dogs
- hamsters
- birds

# Synthpop music artists

- thomas dolby

# Synthesizer music pioneers

- wendy carlos
- rick wakeman

# Synthpop Pioneers

- gillian gilbert
- gary numan

# Length measurements

- inches
- feet

