fix args, update README, remove extra files

2025-08-11 11:48:39 +08:00 · 2023-12-14 08:18:01 -08:00 · 2023-12-14 08:18:01 -08:00 · 1613e608a9
commit 1613e608a9
parent a8d4149147
4 changed files with 38 additions and 97 deletions
--- a/phi2/README.md
+++ b/phi2/README.md
@ -1,24 +1,48 @@
 # Phi-2

-Phi-2 is a 2.7B parameter model released by Microsoft and trained on a mixture of GPT-4 outputs and clean web-text.
-Its performance theoretically rivals much, much stronger models.
+Phi-2 is a 2.7B-parameter model released by Microsoft[^1] and trained on a mixture
+of GPT-4 outputs and clean web-text. Its performance rivals that of
+much, much larger models.

-## Downloading and Converting Weights
+## Setup 

-To download and convert the model:
+Download and convert the model:

 ```sh 
-python phi2/convert.py
+python convert.py
 ```

-That will fill in `weights/phi-2.npz`.
+This will create the file `weights.npz`.

-## Running the Model
+## Generate 

-🚧 (Not yet done) To run the model:
+To generate text with the default prompt:

 ```sh
-python phi2/generate.py
+python model.py
 ```

-Layer-by-layer forward pass outputs are currently shown in the outputs.txt files.
+This should give the output:
+
+```
+Answer: Mathematics is like a lighthouse that guides us through the darkness of
+uncertainty. Just as a lighthouse emits a steady beam of light, mathematics
+provides us with a clear path to navigate through complex problems. It
+illuminates our understanding and helps us make sense of the world around us.
+
+Exercise 2:
+Compare and contrast the role of logic in mathematics and the role of a compass
+in navigation.
+
+Answer: Logic in mathematics is like a compass in navigation. It helps
+```
+
+To use your own prompt:
+
+```sh
+python model.py --prompt <your prompt here> --max_tokens <max_tokens>
+```
+
+[^1]: For more details on the model, see the [blog post](
+https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/)
+and the [Hugging Face repo](https://huggingface.co/microsoft/phi-2).
--- a/phi2/hf_model.py
+++ b/phi2/hf_model.py
@ -1,23 +0,0 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-
-if __name__ == "__main__":
-    model = AutoModelForCausalLM.from_pretrained(
-        "microsoft/phi-2", torch_dtype="auto", trust_remote_code=True
-    )
-    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
-
-    inputs = tokenizer(
-        '''def print_prime(n):
-    """
-    Print all primes between 1 and n
-    """''',
-        return_tensors="pt",
-        return_attention_mask=False,
-    )
-
-    print(model(**inputs))
-
-    # outputs = model.generate(**inputs, max_length=200)
-    # text = tokenizer.batch_decode(outputs)[0]
-    # print(text)
--- a/phi2/model.py
+++ b/phi2/model.py
@ -203,11 +203,14 @@ if __name__ == "__main__":

    prompt = mx.array(prompt)

+    print("[INFO] Generating with Phi-2...", flush=True)
+    print(args.prompt, end="", flush=True)
+
    tokens = []
    for token, _ in zip(generate(prompt, model), range(args.max_tokens)):
        tokens.append(token)

-        if (len(tokens) % args.tokens_per_eval) == 0:
+        if (len(tokens) % 10) == 0:
            mx.eval(tokens)
            s = tokenizer.decode([t.item() for t in tokens])
            print(s, end="", flush=True)
--- a/phi2/phi2_outputs.txt
+++ b/phi2/phi2_outputs.txt
@ -1,63 +0,0 @@
-(HF) Output of Embeddings
-
-tensor([[[-0.0353,  0.0045,  0.0208,  ..., -0.0117,  0.0041,  0.0075],
-         [-0.0172,  0.0236, -0.0051,  ...,  0.0141,  0.0115,  0.0058],
-         [-0.0148,  0.0043, -0.0252,  ...,  0.0179,  0.0025, -0.0008],
-         ...,
-         [ 0.0003,  0.0051,  0.0002,  ...,  0.0043,  0.0075,  0.0049],
-         [-0.0110,  0.0472,  0.0030,  ...,  0.0098, -0.0075,  0.0146],
-         [-0.0085, -0.0219, -0.0016,  ..., -0.0059,  0.0109, -0.0016]]],
-       device='cuda:0', dtype=torch.float16, grad_fn=<EmbeddingBackward0>)
-
-(MLX) Output of Embeddings
-
-array([[[-0.0352783, 0.00445175, 0.020813, ..., -0.0117188, 0.00411606, 0.00748444],
-        [-0.0171509, 0.0236053, -0.00508881, ..., 0.0141144, 0.0115204, 0.00582504],
-        [-0.0147858, 0.00426102, -0.0252075, ..., 0.0179443, 0.0024662, -0.00076437],
-        ...,
-        [0.000337124, 0.00508499, 0.000193119, ..., 0.00427628, 0.00753403, 0.00492477],
-        [-0.0110092, 0.0472107, 0.00295448, ..., 0.00982666, -0.00747681, 0.0145721],
-        [-0.00852203, -0.0218964, -0.00161839, ..., -0.00592422, 0.0108643, -0.00162697]]], dtype=float16)
-
-(HF) Output of First Attention Layer
-
-tensor([[[-0.2000,  0.4849,  0.9863,  ..., -0.2209,  0.1355,  0.3469],
-         [ 0.4922, -0.3865,  0.8428,  ...,  0.5894, -0.0069, -0.5278],
-         [ 0.0902,  0.1028,  0.6826,  ...,  0.1394, -0.8145, -0.1880],
-         ...,
-         [ 0.2380,  0.0555, -0.3005,  ...,  0.0372, -0.0895,  0.0255],
-         [ 0.2512,  0.1949,  0.3401,  ...,  0.3625, -0.3103, -0.1064],
-         [-0.0905,  0.0665,  0.5210,  ..., -0.0767, -0.2460, -0.1449]]],
-       device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>)
-torch.Size([1, 23, 2560])
-
-(MLX) Output of First Attention Layer
-
-array([[[-0.199973, 0.485224, 0.987237, ..., -0.220847, 0.13511, 0.346074],
-        [0.44883, -0.271683, 0.877478, ..., 0.653217, -0.0929724, -0.711176],
-        [-0.233398, 5.7824e-05, 0.435001, ..., 0.0504494, -0.623998, -0.438785],
-        ...,
-        [0.123587, -0.237459, -0.447518, ..., 0.0653363, -0.0767153, -0.341505],
-        [0.187798, 0.331209, 0.0827338, ..., 0.529453, -0.582141, -0.165316],
-        [-0.413614, 0.134572, 0.685769, ..., 0.0796088, 0.0217719, -0.118885]]], dtype=float32)
-[1, 23, 2560]
-
-(HF) Overall Output of Inputs:
-
-tensor([[[ 6.4688,  5.1016,  1.9658,  ..., -2.9043, -2.9043, -2.9043],
-         [ 5.2188,  6.4414,  5.1914,  ..., -0.1852, -0.1862, -0.1866],
-         [ 4.3516,  5.3281,  5.9922,  ..., -0.3689, -0.3699, -0.3696],
-         ...,
-         [10.4141, 11.7031, 12.5859,  ...,  0.7778,  0.7769,  0.7754],
-         [10.7188, 11.7891, 13.3125,  ...,  1.6123,  1.6113,  1.6104],
-         [10.8047, 12.0234, 12.4375,  ...,  0.2321,  0.2314,  0.2317]]],
-
-(MLX) Overall Output of Inputs:
-
-array([[[6.46632, 5.10102, 1.96306, ..., -2.90427, -2.90341, -2.90392],
-        [4.5092, 5.90938, 4.98036, ..., -0.411165, -0.412062, -0.412547],
-        [4.34246, 5.7794, 6.13245, ..., -0.40106, -0.402052, -0.401838],
-        ...,
-        [6.61827, 10.4022, 12.1672, ..., 0.602787, 0.602138, 0.600666],
-        [7.96546, 12.9569, 14.7947, ..., -0.347764, -0.348587, -0.34937],
-        [8.22272, 10.6631, 11.5968, ..., -1.12037, -1.12025, -1.12152]]], dtype=float32)